Keyword search (#88)

* Add keyword search support

* Fix filters display

* Make documents appear immediately
This commit is contained in:
Chris Weaver 2023-06-05 22:25:15 -07:00 committed by GitHub
parent e202aa440e
commit e0ebdc2fc1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 442 additions and 282 deletions

View File

@ -123,6 +123,7 @@ def get_application() -> FastAPI:
logger.info("Verifying query preprocessing (NLTK) data is downloaded")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
logger.info("Verifying public credential exists.")
create_initial_public_credential()

View File

@ -6,7 +6,9 @@ import { DISABLE_AUTH } from "@/lib/constants";
import { HealthCheckBanner } from "@/components/health/healthcheck";
import { ApiKeyModal } from "@/components/openai/ApiKeyModal";
import { buildUrl } from "@/lib/utilsSS";
import { User } from "@/lib/types";
import { Connector, User } from "@/lib/types";
import { cookies } from "next/headers";
import { SearchType } from "@/components/search/SearchTypeSelector";
export default async function Home() {
const tasks = [
@ -24,13 +26,23 @@ export default async function Home() {
return redirect("/auth/login");
}
let connectors = null;
let connectors: Connector<any>[] = [];
if (connectorsResponse.ok) {
connectors = await connectorsResponse.json();
} else {
console.log(`Failed to fetch connectors - ${connectorsResponse.status}`);
}
// needs to be done in a non-client side component due to nextjs
const storedSearchType = cookies().get("searchType")?.value as
| keyof typeof SearchType
| undefined;
let searchTypeDefault: SearchType =
storedSearchType !== undefined &&
SearchType.hasOwnProperty(storedSearchType)
? SearchType[storedSearchType]
: SearchType.SEMANTIC; // default to semantic search
return (
<>
<Header user={user} />
@ -40,7 +52,10 @@ export default async function Home() {
<ApiKeyModal />
<div className="px-24 pt-10 flex flex-col items-center min-h-screen bg-gray-900 text-gray-100">
<div className="w-full">
<SearchSection connectors={connectors} />
<SearchSection
connectors={connectors}
defaultSearchType={searchTypeDefault}
/>
</div>
</div>
</>

View File

@ -1,8 +1,8 @@
import React from "react";
import { Source } from "./interfaces";
import { getSourceIcon } from "../source";
import { Funnel } from "@phosphor-icons/react";
import { ValidSources } from "@/lib/types";
import { Source } from "@/lib/search/interfaces";
const sources: Source[] = [
{ displayName: "Google Drive", internalName: "google_drive" },
@ -34,31 +34,33 @@ export function SourceSelector({
};
return (
<div className="bg-gray-900 p-6">
<div className="flex mb-3 mx-2">
<div className="bg-gray-900 px-6">
<div className="flex mb-2 pb-1 pl-2 border-b border-gray-800 mx-2">
<h2 className="font-bold my-auto">Filters</h2>
<Funnel className="my-auto ml-2" size="20" />
</div>
{sources
.filter((source) => existingSources.includes(source.internalName))
.map((source) => (
<div
key={source.internalName}
className={
"flex cursor-pointer w-full items-center text-white " +
"py-1.5 my-1.5 rounded-lg px-2 " +
(selectedSources.includes(source)
? "bg-gray-700"
: "hover:bg-gray-800")
}
onClick={() => handleSelect(source)}
>
{getSourceIcon(source.internalName, "16")}
<span className="ml-2 text-sm text-gray-200">
{source.displayName}
</span>
</div>
))}
<div className="px-2">
{sources
.filter((source) => existingSources.includes(source.internalName))
.map((source) => (
<div
key={source.internalName}
className={
"flex cursor-pointer w-full items-center text-white " +
"py-1.5 my-1.5 rounded-lg px-2 " +
(selectedSources.includes(source)
? "bg-gray-700"
: "hover:bg-gray-800")
}
onClick={() => handleSelect(source)}
>
{getSourceIcon(source.internalName, "16")}
<span className="ml-2 text-sm text-gray-200">
{source.displayName}
</span>
</div>
))}
</div>
</div>
);
}

View File

@ -26,7 +26,7 @@ export const SearchBar: React.FC<SearchBarProps> = ({ onSearch }) => {
};
return (
<div className="flex justify-center py-4">
<div className="flex justify-center py-3">
<div className="flex items-center w-full border-2 border-gray-600 rounded px-4 py-2 focus-within:border-blue-500">
<MagnifyingGlass className="text-gray-400" />
<textarea

View File

@ -1,12 +1,17 @@
import React from "react";
import { Quote, Document, SearchResponse } from "./types";
import { getSourceIcon } from "../source";
import { LoadingAnimation } from "../Loading";
import { InfoIcon } from "../icons/icons";
import {
DanswerDocument,
SearchResponse,
Quote,
} from "@/lib/search/interfaces";
import { SearchType } from "./SearchTypeSelector";
const removeDuplicateDocs = (documents: Document[]) => {
const removeDuplicateDocs = (documents: DanswerDocument[]) => {
const seen = new Set<string>();
const output: Document[] = [];
const output: DanswerDocument[] = [];
documents.forEach((document) => {
if (
document.semantic_identifier &&
@ -62,54 +67,58 @@ export const SearchResultsDisplay: React.FC<SearchResultsDisplayProps> = ({
return (
<>
{answer && (
<div className="p-4 border-2 rounded-md border-gray-700">
<div className="flex mb-1">
<h2 className="text font-bold my-auto">AI Answer</h2>
</div>
<p className="mb-4">{answer}</p>
<div className="h-56">
<div className="p-4 border-2 rounded-md border-gray-700">
<div className="flex mb-1">
<h2 className="text font-bold my-auto">AI Answer</h2>
</div>
<p className="mb-4">{answer}</p>
{quotes !== null && (
<>
<h2 className="text-sm font-bold mb-2">Sources</h2>
{isFetching && dedupedQuotes.length === 0 ? (
<LoadingAnimation text="Finding quotes" size="text-sm" />
) : (
<div className="flex">
{dedupedQuotes.map((quoteInfo) => (
<a
key={quoteInfo.document_id}
className="p-2 ml-1 border border-gray-800 rounded-lg text-sm flex max-w-[280px] hover:bg-gray-800"
href={quoteInfo.link}
target="_blank"
rel="noopener noreferrer"
>
{getSourceIcon(quoteInfo.source_type, "20")}
<p className="truncate break-all ml-2">
{quoteInfo.semantic_identifier || quoteInfo.document_id}
</p>
</a>
))}
</div>
)}
</>
)}
</div>
)}
{!answer && !isFetching && (
<div className="flex">
<InfoIcon
size="20"
className="text-red-500 my-auto flex flex-shrink-0"
/>
<div className="text-red-500 text-xs my-auto ml-1">
GPT hurt itself in its confusion :(
{quotes !== null && (
<>
<h2 className="text-sm font-bold mb-2">Sources</h2>
{isFetching && dedupedQuotes.length === 0 ? (
<LoadingAnimation text="Finding quotes" size="text-sm" />
) : (
<div className="flex">
{dedupedQuotes.map((quoteInfo) => (
<a
key={quoteInfo.document_id}
className="p-2 ml-1 border border-gray-800 rounded-lg text-sm flex max-w-[280px] hover:bg-gray-800"
href={quoteInfo.link}
target="_blank"
rel="noopener noreferrer"
>
{getSourceIcon(quoteInfo.source_type, "20")}
<p className="truncate break-all ml-2">
{quoteInfo.semantic_identifier ||
quoteInfo.document_id}
</p>
</a>
))}
</div>
)}
</>
)}
</div>
</div>
)}
{/* Only display docs once we're done fetching to avoid distracting from the AI answer*/}
{!isFetching && documents && documents.length > 0 && (
{!answer &&
!isFetching &&
searchResponse.searchType === SearchType.SEMANTIC && (
<div className="flex">
<InfoIcon
size="20"
className="text-red-500 my-auto flex flex-shrink-0"
/>
<div className="text-red-500 text-xs my-auto ml-1">
GPT hurt itself in its confusion :(
</div>
</div>
)}
{documents && documents.length > 0 && (
<div className="mt-4">
<div className="font-bold border-b mb-4 pb-1 border-gray-800">
Results

View File

@ -3,169 +3,27 @@
import { useState } from "react";
import { SearchBar } from "./SearchBar";
import { SearchResultsDisplay } from "./SearchResultsDisplay";
import { Quote, Document, SearchResponse } from "./types";
import { SourceSelector } from "./Filters";
import { Source } from "./interfaces";
import { Connector } from "@/lib/types";
const initialSearchResponse: SearchResponse = {
answer: null,
quotes: null,
documents: null,
};
const processSingleChunk = (
chunk: string,
currPartialChunk: string | null
): [{ [key: string]: any } | null, string | null] => {
const completeChunk = chunk + (currPartialChunk || "");
try {
// every complete chunk should be valid JSON
const chunkJson = JSON.parse(chunk);
return [chunkJson, null];
} catch (err) {
// if it's not valid JSON, then it's probably an incomplete chunk
return [null, completeChunk];
}
};
const processRawChunkString = (
rawChunkString: string,
previousPartialChunk: string | null
): [any[], string | null] => {
/* This is required because, in practice, we see that nginx does not send over
each chunk one at a time even with buffering turned off. Instead,
chunks are sometimes in batches or are sometimes incomplete */
if (!rawChunkString) {
return [[], null];
}
const chunkSections = rawChunkString
.split("\n")
.filter((chunk) => chunk.length > 0);
let parsedChunkSections: any[] = [];
let currPartialChunk = previousPartialChunk;
chunkSections.forEach((chunk) => {
const [processedChunk, partialChunk] = processSingleChunk(
chunk,
currPartialChunk
);
if (processedChunk) {
parsedChunkSections.push(processedChunk);
} else {
currPartialChunk = partialChunk;
}
});
return [parsedChunkSections, currPartialChunk];
};
interface SearchRequestStreamedArgs {
query: string;
sources: Source[];
updateCurrentAnswer: (val: string) => void;
updateQuotes: (quotes: Record<string, Quote>) => void;
updateDocs: (docs: Document[]) => void;
}
const searchRequestStreamed = async ({
query,
sources,
updateCurrentAnswer,
updateQuotes,
updateDocs,
}: SearchRequestStreamedArgs) => {
let answer = "";
let quotes: Record<string, Quote> | null = null;
let relevantDocuments: Document[] | null = null;
try {
const response = await fetch("/api/stream-direct-qa", {
method: "POST",
body: JSON.stringify({
query,
collection: "danswer_index",
...(sources.length > 0
? {
filters: [
{
source_type: sources.map((source) => source.internalName),
},
],
}
: {}),
}),
headers: {
"Content-Type": "application/json",
},
});
const reader = response.body?.getReader();
const decoder = new TextDecoder("utf-8");
let previousPartialChunk = null;
while (true) {
const rawChunk = await reader?.read();
if (!rawChunk) {
throw new Error("Unable to process chunk");
}
const { done, value } = rawChunk;
if (done) {
break;
}
// Process each chunk as it arrives
const [completedChunks, partialChunk] = processRawChunkString(
decoder.decode(value, { stream: true }),
previousPartialChunk
);
if (!completedChunks.length && !partialChunk) {
break;
}
if (partialChunk) {
previousPartialChunk = partialChunk;
}
completedChunks.forEach((chunk) => {
// TODO: clean up response / this logic
const answerChunk = chunk.answer_data;
if (answerChunk) {
answer += answerChunk;
updateCurrentAnswer(answer);
} else if (chunk.answer_finished) {
// set quotes as non-null to signify that the answer is finished and
// we're now looking for quotes
updateQuotes({});
if (
!answer.endsWith(".") &&
!answer.endsWith("?") &&
!answer.endsWith("!")
) {
answer += ".";
updateCurrentAnswer(answer);
}
} else {
if (Object.hasOwn(chunk, "top_documents")) {
const docs = chunk.top_documents as any[] | null;
if (docs) {
relevantDocuments = docs.map(
(doc) => JSON.parse(doc) as Document
);
updateDocs(relevantDocuments);
}
} else {
quotes = chunk as Record<string, Quote>;
updateQuotes(quotes);
}
}
});
}
} catch (err) {
console.error("Fetch error:", err);
}
return { answer, quotes, relevantDocuments };
};
import { SearchType, SearchTypeSelector } from "./SearchTypeSelector";
import {
DanswerDocument,
Quote,
SearchResponse,
Source,
} from "@/lib/search/interfaces";
import { aiSearchRequestStreamed } from "@/lib/search/ai";
import Cookies from "js-cookie";
interface SearchSectionProps {
connectors: Connector<any>[];
defaultSearchType: SearchType;
}
export const SearchSection: React.FC<SearchSectionProps> = ({ connectors }) => {
export const SearchSection: React.FC<SearchSectionProps> = ({
connectors,
defaultSearchType,
}) => {
// Search
const [searchResponse, setSearchResponse] = useState<SearchResponse | null>(
null
@ -175,47 +33,76 @@ export const SearchSection: React.FC<SearchSectionProps> = ({ connectors }) => {
// Filters
const [sources, setSources] = useState<Source[]>([]);
// Search Type
const [selectedSearchType, setSelectedSearchType] =
useState<SearchType>(defaultSearchType);
// helpers
const initialSearchResponse: SearchResponse = {
answer: null,
quotes: null,
documents: null,
searchType: selectedSearchType,
};
const updateCurrentAnswer = (answer: string) =>
setSearchResponse((prevState) => ({
...(prevState || initialSearchResponse),
answer,
}));
const updateQuotes = (quotes: Record<string, Quote>) =>
setSearchResponse((prevState) => ({
...(prevState || initialSearchResponse),
quotes,
}));
const updateDocs = (documents: DanswerDocument[]) =>
setSearchResponse((prevState) => ({
...(prevState || initialSearchResponse),
documents,
}));
return (
<div className="relative max-w-[1500px] mx-auto">
<div className="absolute left-0 ml-24 hidden 2xl:block">
<SourceSelector
selectedSources={sources}
setSelectedSources={setSources}
existingSources={connectors.map((connector) => connector.source)}
/>
{connectors.length > 0 && (
<SourceSelector
selectedSources={sources}
setSelectedSources={setSources}
existingSources={connectors.map((connector) => connector.source)}
/>
)}
</div>
<div className="w-[800px] mx-auto">
<SearchTypeSelector
selectedSearchType={selectedSearchType}
setSelectedSearchType={(searchType) => {
Cookies.set("searchType", searchType);
setSelectedSearchType(searchType);
}}
/>
<SearchBar
onSearch={(query) => {
onSearch={async (query) => {
setIsFetching(true);
setSearchResponse({
answer: null,
quotes: null,
documents: null,
searchType: selectedSearchType,
});
searchRequestStreamed({
await aiSearchRequestStreamed({
query,
sources,
updateCurrentAnswer: (answer) =>
setSearchResponse((prevState) => ({
...(prevState || initialSearchResponse),
answer,
})),
updateQuotes: (quotes) =>
setSearchResponse((prevState) => ({
...(prevState || initialSearchResponse),
quotes,
})),
updateDocs: (documents) =>
setSearchResponse((prevState) => ({
...(prevState || initialSearchResponse),
documents,
})),
}).then(() => {
setIsFetching(false);
updateCurrentAnswer,
updateQuotes,
updateDocs,
searchType: selectedSearchType,
});
setIsFetching(false);
}}
/>
<div className="mt-2">
<SearchResultsDisplay
searchResponse={searchResponse}

View File

@ -0,0 +1,46 @@
const defaultStyle =
"py-1 px-2 border rounded border-gray-700 cursor-pointer font-bold ";
export enum SearchType {
SEMANTIC = "SEMANTIC",
KEYWORD = "KEYWORD",
}
interface Props {
selectedSearchType: SearchType;
setSelectedSearchType: (searchType: SearchType) => void;
}
export const SearchTypeSelector: React.FC<Props> = ({
selectedSearchType,
setSelectedSearchType,
}) => {
return (
<div className="flex text-xs">
<div
className={
defaultStyle +
(selectedSearchType === SearchType.SEMANTIC
? "bg-blue-500"
: "bg-gray-800 hover:bg-gray-600")
}
onClick={() => setSelectedSearchType(SearchType.SEMANTIC)}
>
AI Search
</div>
<div
className={
defaultStyle +
"ml-2 " +
(selectedSearchType === SearchType.KEYWORD
? "bg-blue-500"
: "bg-gray-800 hover:bg-gray-600")
}
onClick={() => setSelectedSearchType(SearchType.KEYWORD)}
>
Keyword Search
</div>
</div>
);
};

View File

@ -1,6 +0,0 @@
import { ValidSources } from "@/lib/types";
export interface Source {
displayName: string;
internalName: ValidSources;
}

View File

@ -1,23 +0,0 @@
import { ValidSources } from "@/lib/types";
export interface Quote {
document_id: string;
link: string;
source_type: ValidSources;
blurb: string;
semantic_identifier: string | null;
}
export interface Document {
document_id: string;
link: string;
source_type: ValidSources;
blurb: string;
semantic_identifier: string | null;
}
export interface SearchResponse {
answer: string | null;
quotes: Record<string, Quote> | null;
documents: Document[] | null;
}

View File

@ -3,3 +3,5 @@ export const INTERNAL_URL = process.env.INTERNAL_URL || "http://127.0.0.1:8080";
export const GOOGLE_DRIVE_AUTH_IS_ADMIN_COOKIE_NAME =
"google_drive_auth_is_admin";
export const SEARCH_TYPE_COOKIE_NAME = "search_type";

143
web/src/lib/search/ai.ts Normal file
View File

@ -0,0 +1,143 @@
import { SearchType } from "@/components/search/SearchTypeSelector";
import { DanswerDocument, Quote, SearchRequestArgs } from "./interfaces";
// Attempts to parse one newline-delimited section of the stream as JSON.
// A section may be the continuation of data that was cut off mid-JSON on the
// previous read, so any leftover partial chunk is prepended before parsing.
// Returns [parsedJson, null] on success or [null, accumulatedPartial] when the
// data is still incomplete.
const processSingleChunk = (
  chunk: string,
  currPartialChunk: string | null
): [{ [key: string]: any } | null, string | null] => {
  // BUG FIX: the previous partial fragment must come BEFORE the new data, and
  // the combined string (not just the new piece) is what gets parsed —
  // otherwise a JSON object split across two reads is never reassembled.
  const completeChunk = (currPartialChunk || "") + chunk;
  try {
    // every complete chunk should be valid JSON
    const chunkJson = JSON.parse(completeChunk);
    return [chunkJson, null];
  } catch (err) {
    // if it's not valid JSON, then it's probably an incomplete chunk
    return [null, completeChunk];
  }
};
/* Splits a raw streamed string into parsed JSON objects plus any trailing
   incomplete fragment. This is required because, in practice, we see that
   nginx does not send over each chunk one at a time even with buffering
   turned off. Instead, chunks are sometimes in batches or are sometimes
   incomplete. */
const processRawChunkString = (
  rawChunkString: string,
  previousPartialChunk: string | null
): [any[], string | null] => {
  if (!rawChunkString) {
    return [[], null];
  }

  const parsedChunkSections: any[] = [];
  let currPartialChunk = previousPartialChunk;

  for (const section of rawChunkString.split("\n")) {
    if (section.length === 0) {
      continue;
    }
    const [processedChunk, partialChunk] = processSingleChunk(
      section,
      currPartialChunk
    );
    if (processedChunk) {
      parsedChunkSections.push(processedChunk);
    } else {
      // still mid-object; carry the fragment forward to the next section/read
      currPartialChunk = partialChunk;
    }
  }

  return [parsedChunkSections, currPartialChunk];
};
// Streams an AI-answered search ("direct QA") from the backend. As chunks
// arrive, the update callbacks are invoked incrementally with the accumulated
// answer text, the quotes, and the retrieved documents. Returns the final
// accumulated { answer, quotes, relevantDocuments }; fetch/stream errors are
// logged and swallowed, so callers get whatever was accumulated so far.
export const aiSearchRequestStreamed = async ({
  query,
  sources,
  updateCurrentAnswer,
  updateQuotes,
  updateDocs,
  searchType,
}: SearchRequestArgs) => {
  let answer = "";
  let quotes: Record<string, Quote> | null = null;
  let relevantDocuments: DanswerDocument[] | null = null;
  try {
    const response = await fetch("/api/stream-direct-qa", {
      method: "POST",
      body: JSON.stringify({
        query,
        collection: "danswer_index",
        // keyword mode is signaled to the backend via this flag
        use_keyword: searchType === SearchType.KEYWORD,
        // only attach source filters when the user selected at least one source
        ...(sources.length > 0
          ? {
              filters: [
                {
                  source_type: sources.map((source) => source.internalName),
                },
              ],
            }
          : {}),
      }),
      headers: {
        "Content-Type": "application/json",
      },
    });
    const reader = response.body?.getReader();
    const decoder = new TextDecoder("utf-8");

    // carries any trailing, incomplete JSON fragment between reads
    let previousPartialChunk = null;
    while (true) {
      const rawChunk = await reader?.read();
      if (!rawChunk) {
        throw new Error("Unable to process chunk");
      }
      const { done, value } = rawChunk;
      if (done) {
        break;
      }

      // Process each chunk as it arrives
      const [completedChunks, partialChunk] = processRawChunkString(
        decoder.decode(value, { stream: true }),
        previousPartialChunk
      );
      if (!completedChunks.length && !partialChunk) {
        break;
      }
      if (partialChunk) {
        previousPartialChunk = partialChunk;
      }
      completedChunks.forEach((chunk) => {
        // TODO: clean up response / this logic
        const answerChunk = chunk.answer_data;
        if (answerChunk) {
          answer += answerChunk;
          updateCurrentAnswer(answer);
        } else if (chunk.answer_finished) {
          // set quotes as non-null to signify that the answer is finished and
          // we're now looking for quotes
          updateQuotes({});
          // make sure the displayed answer ends with terminal punctuation
          if (
            !answer.endsWith(".") &&
            !answer.endsWith("?") &&
            !answer.endsWith("!")
          ) {
            answer += ".";
            updateCurrentAnswer(answer);
          }
        } else {
          if (Object.hasOwn(chunk, "top_documents")) {
            // each entry in top_documents is itself a JSON-encoded doc string
            const docs = chunk.top_documents as any[] | null;
            if (docs) {
              relevantDocuments = docs.map(
                (doc) => JSON.parse(doc) as DanswerDocument
              );
              updateDocs(relevantDocuments);
            }
          } else {
            // any other object is assumed to be the quotes payload
            quotes = chunk as Record<string, Quote>;
            updateQuotes(quotes);
          }
        }
      });
    }
  } catch (err) {
    console.error("Fetch error:", err);
  }
  return { answer, quotes, relevantDocuments };
};

View File

@ -0,0 +1,39 @@
import { SearchType } from "@/components/search/SearchTypeSelector";
import { ValidSources } from "../types";
/** A quoted snippet from a source document that backs part of the AI answer. */
export interface Quote {
  document_id: string;
  link: string;
  source_type: ValidSources;
  blurb: string;
  // null when no human-readable name is available for the document
  semantic_identifier: string | null;
}

/** A document returned by search (same shape as Quote minus the quote text). */
export interface DanswerDocument {
  document_id: string;
  link: string;
  source_type: ValidSources;
  blurb: string;
  // null when no human-readable name is available for the document
  semantic_identifier: string | null;
}

/** Accumulated result of a search; fields are null until streamed in. */
export interface SearchResponse {
  // which search mode produced this response
  searchType: SearchType;
  answer: string | null;
  // keyed by document_id; set to {} to signal the answer finished streaming
  quotes: Record<string, Quote> | null;
  documents: DanswerDocument[] | null;
}

/** A connector source the user can filter results by. */
export interface Source {
  displayName: string;
  internalName: ValidSources;
}

/** Arguments shared by the search request implementations (AI / keyword). */
export interface SearchRequestArgs {
  query: string;
  // selected source filters; empty means "no filtering"
  sources: Source[];
  // incremental callbacks invoked as results stream in
  updateCurrentAnswer: (val: string) => void;
  updateQuotes: (quotes: Record<string, Quote>) => void;
  updateDocs: (documents: DanswerDocument[]) => void;
  searchType: SearchType;
}

View File

@ -0,0 +1,45 @@
import { DanswerDocument, SearchRequestArgs } from "./interfaces";
interface KeywordResponse {
  top_ranked_docs: DanswerDocument[];
  semi_ranked_docs: DanswerDocument[];
}

/**
 * Runs a (non-streamed) keyword search against the backend and pushes the
 * matching documents to `updateDocs`. The answer/quote callbacks in
 * `SearchRequestArgs` are unused here since keyword search returns documents
 * only. On HTTP failure the error is logged and no documents are pushed.
 */
export const keywordSearch = async ({
  query,
  sources,
  updateDocs,
}: SearchRequestArgs): Promise<void> => {
  const response = await fetch("/api/keyword-search", {
    method: "POST",
    body: JSON.stringify({
      query,
      collection: "danswer_index",
      // only attach source filters when at least one source is selected
      ...(sources.length > 0
        ? {
            filters: [
              {
                source_type: sources.map((source) => source.internalName),
              },
            ],
          }
        : {}),
    }),
    headers: {
      "Content-Type": "application/json",
    },
  });
  if (!response.ok) {
    // surface the failure instead of silently dropping it
    console.log(`Failed to fetch keyword search results - ${response.status}`);
    return;
  }

  const keywordResults = (await response.json()) as KeywordResponse;

  // semi-ranked docs (when present) are appended after the top-ranked ones
  let matchingDocs = keywordResults.top_ranked_docs;
  if (keywordResults.semi_ranked_docs) {
    matchingDocs = matchingDocs.concat(keywordResults.semi_ranked_docs);
  }
  updateDocs(matchingDocs);
};