mirror of
https://github.com/danswer-ai/danswer.git
synced 2025-03-28 18:52:31 +01:00
241 lines
14 KiB
Python
241 lines
14 KiB
Python
# This script preps the documents used for initially seeding the index. It handles the embedding so that the
|
|
# documents can be added to the index with minimal processing.
|
|
import json
|
|
|
|
from pydantic import BaseModel
|
|
from sentence_transformers import SentenceTransformer # type: ignore
|
|
|
|
|
|
class SeedPresaveDocument(BaseModel):
|
|
url: str
|
|
title: str
|
|
content: str
|
|
title_embedding: list[float]
|
|
content_embedding: list[float]
|
|
chunk_ind: int = 0
|
|
|
|
|
|
# Be sure to use the default embedding model
|
|
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
|
|
tokenizer = model.tokenizer
|
|
|
|
# This is easier than cleaning up the crawl, needs to be updated if the sites are changed
|
|
overview_title = "Use Cases Overview"
|
|
overview = (
|
|
"How to leverage Onyx in your organization\n\n"
|
|
"Onyx Overview\n"
|
|
"Onyx is the AI Assistant connected to your organization's docs, apps, and people. "
|
|
"Onyx makes Generative AI more versatile for work by enabling new types of questions like "
|
|
'"What is the most common feature request we\'ve heard from customers this month". '
|
|
"Whereas other AI systems have no context of your team and are generally unhelpful with work related questions, "
|
|
"Onyx makes it possible to ask these questions in natural language and get back answers in seconds.\n\n"
|
|
"Onyx can connect to +30 different tools and the use cases are not limited to the ones in the following pages. "
|
|
"The highlighted use cases are for inspiration and come from feedback gathered from our users and customers.\n\n\n"
|
|
"Common Getting Started Questions:\n\n"
|
|
"Why are these docs connected in my Onyx deployment?\n"
|
|
"Answer: This is just an example of how connectors work in Onyx. You can connect up your own team's knowledge "
|
|
"and you will be able to ask questions unique to your organization. Onyx will keep all of the knowledge up to date "
|
|
"and in sync with your connected applications.\n\n"
|
|
"Is my data being sent anywhere when I connect it up to Onyx?\n"
|
|
"Answer: No! Onyx is built with data security as our highest priority. We open sourced it so our users can know "
|
|
"exactly what is going on with their data. By default all of the document processing happens within Onyx. "
|
|
"The only time it is sent outward is for the GenAI call to generate answers.\n\n"
|
|
"Where is the feature for auto sync-ing document level access permissions from all connected sources?\n"
|
|
"Answer: This falls under the Enterprise Edition set of Onyx features built on top of the MIT/community edition. "
|
|
"If you are on Onyx Cloud, you have access to them by default. If you're running it yourself, reach out to the "
|
|
"Onyx team to receive access."
|
|
)
|
|
|
|
enterprise_search_title = "Enterprise Search"
|
|
enterprise_search_1 = (
|
|
"Value of Enterprise Search with Onyx\n\n"
|
|
"What is Enterprise Search and why is it Important?\n"
|
|
"An Enterprise Search system gives team members a single place to access all of the disparate knowledge "
|
|
"of an organization. Critical information is saved across a host of channels like call transcripts with "
|
|
"prospects, engineering design docs, IT runbooks, customer support email exchanges, project management "
|
|
"tickets, and more. As fast moving teams scale up, information gets spread out and more disorganized.\n\n"
|
|
"Since it quickly becomes infeasible to check across every source, decisions get made on incomplete "
|
|
"information, employee satisfaction decreases, and the most valuable members of your team are tied up "
|
|
"with constant distractions as junior teammates are unable to unblock themselves. Onyx solves this "
|
|
"problem by letting anyone on the team access all of the knowledge across your organization in a "
|
|
"permissioned and secure way. Users can ask questions in natural language and get back answers and "
|
|
"documents across all of the connected sources instantly.\n\n"
|
|
"What's the real cost?\n"
|
|
"A typical knowledge worker spends over 2 hours a week on search, but more than that, the cost of "
|
|
"incomplete or incorrect information can be extremely high. Customer support/success that isn't able "
|
|
"to find the reference to similar cases could cause hours or even days of delay leading to lower "
|
|
"customer satisfaction or in the worst case - churn. An account exec not realizing that a prospect had "
|
|
"previously mentioned a specific need could lead to lost deals. An engineer not realizing a similar "
|
|
"feature had previously been built could result in weeks of wasted development time and tech debt with "
|
|
"duplicate implementation. With a lack of knowledge, your whole organization is navigating in the dark "
|
|
"- inefficient and mistake prone."
|
|
)
|
|
|
|
enterprise_search_2 = (
|
|
"More than Search\n"
|
|
"When analyzing the entire corpus of knowledge within your company is as easy as asking a question "
|
|
"in a search bar, your entire team can stay informed and up to date. Onyx also makes it trivial "
|
|
"to identify where knowledge is well documented and where it is lacking. Team members who are centers "
|
|
"of knowledge can begin to effectively document their expertise since it is no longer being thrown into "
|
|
"a black hole. All of this allows the organization to achieve higher efficiency and drive business outcomes.\n\n"
|
|
"With Generative AI, the entire user experience has evolved as well. For example, instead of just finding similar "
|
|
"cases for your customer support team to reference, Onyx breaks down the issue and explains it so that even "
|
|
"the most junior members can understand it. This in turn lets them give the most holistic and technically accurate "
|
|
"response possible to your customers. On the other end, even the super stars of your sales team will not be able "
|
|
"to review 10 hours of transcripts before hopping on that critical call, but Onyx can easily parse through it "
|
|
"in mere seconds and give crucial context to help your team close."
|
|
)
|
|
|
|
ai_platform_title = "AI Platform"
|
|
ai_platform = (
|
|
"Build AI Agents powered by the knowledge and workflows specific to your organization.\n\n"
|
|
"Beyond Answers\n"
|
|
"Agents enabled by generative AI and reasoning capable models are helping teams to automate their work. "
|
|
"Onyx is helping teams make it happen. Onyx provides out of the box user chat sessions, attaching custom tools, "
|
|
"handling LLM reasoning, code execution, data analysis, referencing internal knowledge, and much more.\n\n"
|
|
"Onyx as a platform is not a no-code agent builder. We are made by developers for developers and this gives your "
|
|
"team the full flexibility and power to create agents not constrained by blocks and simple logic paths.\n\n"
|
|
"Flexibility and Extensibility\n"
|
|
"Onyx is open source and completely whitebox. This not only gives transparency to what happens within the system "
|
|
"but also means that your team can directly modify the source code to suit your unique needs."
|
|
)
|
|
|
|
customer_support_title = "Customer Support"
|
|
customer_support = (
|
|
"Help your customer support team instantly answer any question across your entire product.\n\n"
|
|
"AI Enabled Support\n"
|
|
"Customer support agents have one of the highest breadth jobs. They field requests that cover the entire surface "
|
|
"area of the product and need to help your users find success on extremely short timelines. "
|
|
"Because they're not the same people who designed or built the system, they often lack the depth of understanding "
|
|
"needed - resulting in delays and escalations to other teams. Modern teams are leveraging AI to help their CS team "
|
|
"optimize the speed and quality of these critical customer-facing interactions.\n\n"
|
|
"The Importance of Context\n"
|
|
"There are two critical components of AI copilots for customer support. The first is that the AI system needs to be "
|
|
"connected with as much information as possible (not just support tools like Zendesk or Intercom) and that the "
|
|
"knowledge needs to be as fresh as possible. Sometimes a fix might even be in places rarely checked by CS such as "
|
|
"pull requests in a code repository. The second critical component is the ability of the AI system to break down "
|
|
"difficult concepts and convoluted processes into more digestible descriptions and for your team members to be able "
|
|
"to chat back and forth with the system to build a better understanding.\n\n"
|
|
"Onyx takes care of both of these. The system connects up to over 30+ different applications and the knowledge is "
|
|
"pulled in constantly so that the information access is always up to date."
|
|
)
|
|
|
|
sales_title = "Sales"
|
|
sales = (
|
|
"Keep your team up to date on every conversation and update so they can close.\n\n"
|
|
"Recall Every Detail\n"
|
|
"Being able to instantly revisit every detail of any call without reading transcripts is helping Sales teams provide "
|
|
"more tailored pitches, build stronger relationships, and close more deals. Instead of searching and reading through "
|
|
'hours of transcripts in preparation for a call, your team can now ask Onyx "What specific features was ACME '
|
|
"interested in seeing for the demo\". Since your team doesn't have time to read every transcript prior to a call, "
|
|
"Onyx provides a more thorough summary because it can instantly parse hundreds of pages and distill out the relevant "
|
|
"information. Even for fast lookups it becomes much more convenient - for example to brush up on connection building "
|
|
'topics by asking "What rapport building topic did we chat about in the last call with ACME".\n\n'
|
|
"Know Every Product Update\n"
|
|
"It is impossible for Sales teams to keep up with every product update. Because of this, when a prospect has a question "
|
|
"that the Sales team does not know, they have no choice but to rely on the Product and Engineering orgs to get an "
|
|
"authoritative answer. Not only is this distracting to the other teams, it also slows down the time to respond to the "
|
|
"prospect (and as we know, time is the biggest killer of deals). With Onyx, it is even possible to get answers live "
|
|
'on call because of how fast accessing information becomes. A question like "Have we shipped the Microsoft AD '
|
|
'integration yet?" can now be answered in seconds meaning that prospects can get answers while on the call instead of '
|
|
"asynchronously and sales cycles are reduced as a result."
|
|
)
|
|
|
|
operations_title = "Operations"
|
|
operations = (
|
|
"Double the productivity of your Ops teams like IT, HR, etc.\n\n"
|
|
"Automatically Resolve Tickets\n"
|
|
"Modern teams are leveraging AI to auto-resolve up to 50% of tickets. Whether it is an employee asking about benefits "
|
|
"details or how to set up the VPN for remote work, Onyx can help your team help themselves. This frees up your team to "
|
|
"do the real impactful work of landing star candidates or improving your internal processes.\n\n"
|
|
"AI Aided Onboarding\n"
|
|
"One of the periods where your team needs the most help is when they're just ramping up. Instead of feeling lost in dozens "
|
|
"of new tools, Onyx gives them a single place where they can ask about anything in natural language. Whether it's how to "
|
|
"set up their work environment or what their onboarding goals are, Onyx can walk them through every step with the help "
|
|
"of Generative AI. This lets your team feel more empowered and gives time back to the more seasoned members of your team to "
|
|
"focus on moving the needle."
|
|
)
|
|
|
|
# For simplicity, we're not adding any metadata suffix here. Generally there is none for the Web connector anyway
|
|
overview_doc = SeedPresaveDocument(
|
|
url="https://docs.onyx.app/more/use_cases/overview",
|
|
title=overview_title,
|
|
content=overview,
|
|
title_embedding=model.encode(f"search_document: {overview_title}"),
|
|
content_embedding=model.encode(f"search_document: {overview_title}\n{overview}"),
|
|
)
|
|
|
|
enterprise_search_doc = SeedPresaveDocument(
|
|
url="https://docs.onyx.app/more/use_cases/enterprise_search",
|
|
title=enterprise_search_title,
|
|
content=enterprise_search_1,
|
|
title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
|
|
content_embedding=model.encode(
|
|
f"search_document: {enterprise_search_title}\n{enterprise_search_1}"
|
|
),
|
|
)
|
|
|
|
enterprise_search_doc_2 = SeedPresaveDocument(
|
|
url="https://docs.onyx.app/more/use_cases/enterprise_search",
|
|
title=enterprise_search_title,
|
|
content=enterprise_search_2,
|
|
title_embedding=model.encode(f"search_document: {enterprise_search_title}"),
|
|
content_embedding=model.encode(
|
|
f"search_document: {enterprise_search_title}\n{enterprise_search_2}"
|
|
),
|
|
chunk_ind=1,
|
|
)
|
|
|
|
ai_platform_doc = SeedPresaveDocument(
|
|
url="https://docs.onyx.app/more/use_cases/ai_platform",
|
|
title=ai_platform_title,
|
|
content=ai_platform,
|
|
title_embedding=model.encode(f"search_document: {ai_platform_title}"),
|
|
content_embedding=model.encode(
|
|
f"search_document: {ai_platform_title}\n{ai_platform}"
|
|
),
|
|
)
|
|
|
|
customer_support_doc = SeedPresaveDocument(
|
|
url="https://docs.onyx.app/more/use_cases/customer_support",
|
|
title=customer_support_title,
|
|
content=customer_support,
|
|
title_embedding=model.encode(f"search_document: {customer_support_title}"),
|
|
content_embedding=model.encode(
|
|
f"search_document: {customer_support_title}\n{customer_support}"
|
|
),
|
|
)
|
|
|
|
sales_doc = SeedPresaveDocument(
|
|
url="https://docs.onyx.app/more/use_cases/sales",
|
|
title=sales_title,
|
|
content=sales,
|
|
title_embedding=model.encode(f"search_document: {sales_title}"),
|
|
content_embedding=model.encode(f"search_document: {sales_title}\n{sales}"),
|
|
)
|
|
|
|
operations_doc = SeedPresaveDocument(
|
|
url="https://docs.onyx.app/more/use_cases/operations",
|
|
title=operations_title,
|
|
content=operations,
|
|
title_embedding=model.encode(f"search_document: {operations_title}"),
|
|
content_embedding=model.encode(
|
|
f"search_document: {operations_title}\n{operations}"
|
|
),
|
|
)
|
|
|
|
documents = [
|
|
overview_doc,
|
|
enterprise_search_doc,
|
|
enterprise_search_doc_2,
|
|
ai_platform_doc,
|
|
customer_support_doc,
|
|
sales_doc,
|
|
operations_doc,
|
|
]
|
|
|
|
documents_dict = [doc.model_dump() for doc in documents]
|
|
|
|
with open("./backend/onyx/seeding/initial_docs.json", "w") as json_file:
|
|
json.dump(documents_dict, json_file, indent=4)
|