add second task, pdf-to-text

example for kind 5000 event
This commit is contained in:
Believethehype
2023-11-19 16:59:56 +01:00
parent c450c8544f
commit 535d5c2dd6
12 changed files with 148 additions and 30 deletions

View File

@@ -5,4 +5,5 @@ USER_DB_PATH = nostrzaps.db
LNBITS_INVOICE_KEY = lnbitswalletinvoicekey LNBITS_INVOICE_KEY = lnbitswalletinvoicekey
LNBITS_HOST = https://lnbits.com LNBITS_HOST = https://lnbits.com
TASK_TEXTEXTRACTION_NIP89_DTAG = "asdd"
TASK_TRANSLATION_NIP89_DTAG = abcded TASK_TRANSLATION_NIP89_DTAG = abcded

View File

@@ -1,6 +1,6 @@
# Nostr Data Vending Machine Python Implementation # Nostr Data Vending Machine Python Implementation
This example DVM implementation in Python currently supports simple translations using Google translate. This example DVM implementation in Python currently supports simple translations using Google translate, as well as extraction of text from links with pdf files.
At a later stage, additional example tasks will be added, as well as the integration into a larger Machine Learning backend At a later stage, additional example tasks will be added, as well as the integration into a larger Machine Learning backend

16
dvm.py
View File

@@ -4,7 +4,7 @@ import time
import emoji import emoji
from utils.definitions import EventDefinitions, DVMConfig, RequiredJobToWatch, JobToWatch, LOCAL_TASKS from utils.definitions import EventDefinitions, DVMConfig, RequiredJobToWatch, JobToWatch, LOCAL_TASKS
from utils.admin_utils import admin_make_database_updates from utils.admin_utils import admin_make_database_updates
from utils.ai_utils import GoogleTranslate from utils.ai_utils import extract_text_from_pdf, google_translate
from utils.backend_utils import get_amount_per_task, check_task_is_supported, get_task from utils.backend_utils import get_amount_per_task, check_task_is_supported, get_task
from utils.database_utils import update_sql_table, get_from_sql_table, \ from utils.database_utils import update_sql_table, get_from_sql_table, \
create_sql_table, get_or_add_user, update_user_balance create_sql_table, get_or_add_user, update_user_balance
@@ -38,6 +38,7 @@ def dvm(config):
dm_zap_filter = Filter().pubkey(pk).kinds([EventDefinitions.KIND_ZAP]).since(Timestamp.now()) dm_zap_filter = Filter().pubkey(pk).kinds([EventDefinitions.KIND_ZAP]).since(Timestamp.now())
dvm_filter = (Filter().kinds([EventDefinitions.KIND_NIP90_GENERIC, dvm_filter = (Filter().kinds([EventDefinitions.KIND_NIP90_GENERIC,
EventDefinitions.KIND_NIP90_EXTRACT_TEXT,
EventDefinitions.KIND_NIP90_TRANSLATE_TEXT, EventDefinitions.KIND_NIP90_TRANSLATE_TEXT,
]).since(Timestamp.now())) ]).since(Timestamp.now()))
client.subscribe([dm_zap_filter, dvm_filter]) client.subscribe([dm_zap_filter, dvm_filter])
@@ -60,13 +61,12 @@ def dvm(config):
user = get_or_add_user(event.pubkey().to_hex()) user = get_or_add_user(event.pubkey().to_hex())
is_whitelisted = user[2] is_whitelisted = user[2]
is_blacklisted = user[3] is_blacklisted = user[3]
if is_whitelisted:
task_supported, task, duration = check_task_is_supported(event, client=client, get_duration=False, task_supported, task, duration = check_task_is_supported(event, client=client,
get_duration=(not is_whitelisted),
config=dvm_config) config=dvm_config)
print(task) print(task)
else:
task_supported, task, duration = check_task_is_supported(event, client=client, get_duration=True,
config=dvm_config)
if is_blacklisted: if is_blacklisted:
send_job_status_reaction(event, "error", client=client, config=dvm_config) send_job_status_reaction(event, "error", client=client, config=dvm_config)
print("[Nostr] Request by blacklisted user, skipped") print("[Nostr] Request by blacklisted user, skipped")
@@ -231,7 +231,9 @@ def dvm(config):
options = dict(opts) options = dict(opts)
if task == "translation": if task == "translation":
result = GoogleTranslate(options["text"], options["translation_lang"]) result = google_translate(options["text"], options["translation_lang"])
elif task == "pdf-to-text":
result = extract_text_from_pdf(options["url"])
# TODO ADD FURTHER LOCAL TASKS HERE # TODO ADD FURTHER LOCAL TASKS HERE
check_and_return_event(result, str(job_event.as_json()), check_and_return_event(result, str(job_event.as_json()),

22
main.py
View File

@@ -11,21 +11,29 @@ def run_nostr_dvm_with_local_config():
from dvm import dvm, DVMConfig from dvm import dvm, DVMConfig
from utils.nip89_utils import NIP89Announcement from utils.nip89_utils import NIP89Announcement
dvmconfig = DVMConfig() dvmconfig = DVMConfig()
dvmconfig.PRIVATE_KEY = os.getenv(env.NOSTR_PRIVATE_KEY) dvmconfig.PRIVATE_KEY = os.getenv(env.NOSTR_PRIVATE_KEY)
dvmconfig.SUPPORTED_TASKS = ["translation"] dvmconfig.SUPPORTED_TASKS = ["translation", "pdf-to-text"]
dvmconfig.LNBITS_INVOICE_KEY = os.getenv(env.LNBITS_INVOICE_KEY) dvmconfig.LNBITS_INVOICE_KEY = os.getenv(env.LNBITS_INVOICE_KEY)
dvmconfig.LNBITS_URL = os.getenv(env.LNBITS_HOST) dvmconfig.LNBITS_URL = os.getenv(env.LNBITS_HOST)
# In admin_utils, set rebroadcast_nip89 to True to (re)broadcast your DVM. You can create a valid dtag and the content on vendata.io.
# Add the dtag to your .env file so you can update your DVM later, and change the content here as needed.
# NIP-89 announcement for the pdf text-extraction task.
nip89extraction = NIP89Announcement()
nip89extraction.kind = EventDefinitions.KIND_NIP90_EXTRACT_TEXT
# Read the dtag from the text-extraction key (TASK_TEXTEXTRACTION_NIP89_DTAG), not the
# translation one, so this announcement does not share a dtag with the translation task.
nip89extraction.dtag = os.getenv(env.TASK_TEXTEXTRACTION_NIP89_DTAG)
nip89extraction.pk = os.getenv(env.NOSTR_PRIVATE_KEY)
nip89extraction.content = "{\"name\":\"Pdf Extractor\",\"image\":\"https://image.nostr.build/c33ca6fc4cc038ca4adb46fdfdfda34951656f87ee364ef59095bae1495ce669.jpg\",\"about\":\"I extract Text from pdf documents\",\"nip90Params\":{}}"
dvmconfig.NIP89s.append(nip89extraction)
nip89translation = NIP89Announcement() nip89translation = NIP89Announcement()
nip89translation.kind = EventDefinitions.KIND_NIP90_TRANSLATE_TEXT nip89translation.kind = EventDefinitions.KIND_NIP90_TRANSLATE_TEXT
nip89translation.dtag = os.getenv(env.TASK_TRANSLATION_NIP89_DTAG) nip89translation.dtag = os.getenv(env.TASK_TRANSLATION_NIP89_DTAG)
nip89translation.pk = os.getenv(env.NOSTR_PRIVATE_KEY) nip89translation.pk = os.getenv(env.NOSTR_PRIVATE_KEY)
nip89translation.content = "{\"name\":\"NostrAI DVM Translator\",\"image\":\"https://cdn.nostr.build/i/feb98d8700abe7d6c67d9106a72a20354bf50805af79869638f5a32d24a5ac2a.jpg\",\"about\":\"Translates Text from given text/event/job, currently using Google Translation Services into language defined in param. \",\"nip90Params\":{\"language\":{\"required\":true,\"values\":[\"af\",\"am\",\"ar\",\"az\",\"be\",\"bg\",\"bn\",\"bs\",\"ca\",\"ceb\",\"co\",\"cs\",\"cy\",\"da\",\"de\",\"el\",\"eo\",\"es\",\"et\",\"eu\",\"fa\",\"fi\",\"fr\",\"fy\",\"ga\",\"gd\",\"gl\",\"gu\",\"ha\",\"haw\",\"hi\",\"hmn\",\"hr\",\"ht\",\"hu\",\"hy\",\"id\",\"ig\",\"is\",\"it\",\"he\",\"ja\",\"jv\",\"ka\",\"kk\",\"km\",\"kn\",\"ko\",\"ku\",\"ky\",\"la\",\"lb\",\"lo\",\"lt\",\"lv\",\"mg\",\"mi\",\"mk\",\"ml\",\"mn\",\"mr\",\"ms\",\"mt\",\"my\",\"ne\",\"nl\",\"no\",\"ny\",\"or\",\"pa\",\"pl\",\"ps\",\"pt\",\"ro\",\"ru\",\"sd\",\"si\",\"sk\",\"sl\",\"sm\",\"sn\",\"so\",\"sq\",\"sr\",\"st\",\"su\",\"sv\",\"sw\",\"ta\",\"te\",\"tg\",\"th\",\"tl\",\"tr\",\"ug\",\"uk\",\"ur\",\"uz\",\"vi\",\"xh\",\"yi\",\"yo\",\"zh\",\"zu\"]}}}" nip89translation.content = "{\"name\":\"Translator\",\"image\":\"https://image.nostr.build/c33ca6fc4cc038ca4adb46fdfdfda34951656f87ee364ef59095bae1495ce669.jpg\",\"about\":\"I translate text from given text/event/job, currently using Google Translation Services into language defined in param. 
\",\"nip90Params\":{\"language\":{\"required\":true,\"values\":[\"af\",\"am\",\"ar\",\"az\",\"be\",\"bg\",\"bn\",\"bs\",\"ca\",\"ceb\",\"co\",\"cs\",\"cy\",\"da\",\"de\",\"el\",\"eo\",\"es\",\"et\",\"eu\",\"fa\",\"fi\",\"fr\",\"fy\",\"ga\",\"gd\",\"gl\",\"gu\",\"ha\",\"haw\",\"hi\",\"hmn\",\"hr\",\"ht\",\"hu\",\"hy\",\"id\",\"ig\",\"is\",\"it\",\"he\",\"ja\",\"jv\",\"ka\",\"kk\",\"km\",\"kn\",\"ko\",\"ku\",\"ky\",\"la\",\"lb\",\"lo\",\"lt\",\"lv\",\"mg\",\"mi\",\"mk\",\"ml\",\"mn\",\"mr\",\"ms\",\"mt\",\"my\",\"ne\",\"nl\",\"no\",\"ny\",\"or\",\"pa\",\"pl\",\"ps\",\"pt\",\"ro\",\"ru\",\"sd\",\"si\",\"sk\",\"sl\",\"sm\",\"sn\",\"so\",\"sq\",\"sr\",\"st\",\"su\",\"sv\",\"sw\",\"ta\",\"te\",\"tg\",\"th\",\"tl\",\"tr\",\"ug\",\"uk\",\"ur\",\"uz\",\"vi\",\"xh\",\"yi\",\"yo\",\"zh\",\"zu\"]}}}"
dvmconfig.NIP89s.append(nip89translation) dvmconfig.NIP89s.append(nip89translation)
nostr_dvm_thread = Thread(target=dvm, args=[dvmconfig]) nostr_dvm_thread = Thread(target=dvm, args=[dvmconfig])
nostr_dvm_thread.start() nostr_dvm_thread.start()
@@ -39,10 +47,4 @@ if __name__ == '__main__':
else: else:
raise FileNotFoundError(f'.env file not found at {env_path} ') raise FileNotFoundError(f'.env file not found at {env_path} ')
run_nostr_dvm_with_local_config() run_nostr_dvm_with_local_config()

View File

@@ -30,3 +30,37 @@ translatepy==2.3
tzdata==2023.3 tzdata==2023.3
urllib3==2.1.0 urllib3==2.1.0
wcwidth==0.2.10 wcwidth==0.2.10
# Output of `pip freeze`:
beautifulsoup4==4.12.2
bech32==1.2.0
blessed==1.20.0
certifi==2023.7.22
charset-normalizer==3.3.2
emoji==2.8.0
ffmpegio==0.8.5
ffmpegio-core==0.8.5
idna==3.4
inquirer==3.1.3
install==1.3.5
nostr-sdk==0.0.4
numpy==1.26.2
packaging==23.2
pandas==2.1.3
Pillow==10.1.0
pluggy==1.3.0
pycryptodome==3.19.0
pypdf==3.17.1
python-dateutil==2.8.2
python-dotenv==1.0.0
python-editor==1.0.4
pytz==2023.3.post1
pyuseragents==1.0.5
readchar==4.0.5
requests==2.31.0
safeIO==1.2
six==1.16.0
soupsieve==2.5
translatepy==2.3
tzdata==2023.3
urllib3==2.1.0
wcwidth==0.2.10

View File

@@ -3,8 +3,8 @@ import time
from nostr_sdk import Keys, EventBuilder, PublicKey from nostr_sdk import Keys, EventBuilder, PublicKey
from utils.database_utils import get_from_sql_table, list_db, clear_db, delete_from_sql_table, update_sql_table, \ from utils.database_utils import get_from_sql_table, list_db, delete_from_sql_table, update_sql_table, \
get_or_add_user, update_user_metadata get_or_add_user, clean_db
from utils.nip89_utils import nip89_announce_tasks from utils.nip89_utils import nip89_announce_tasks
from utils.nostr_utils import send_event from utils.nostr_utils import send_event
@@ -14,7 +14,7 @@ def admin_make_database_updates(config=None, client=None):
dvmconfig = config dvmconfig = config
rebroadcast_nip89 = False rebroadcast_nip89 = False
cleardb = False cleandb = False
listdatabase = False listdatabase = False
deleteuser = False deleteuser = False
whitelistuser = False whitelistuser = False
@@ -57,8 +57,8 @@ def admin_make_database_updates(config=None, client=None):
if deleteuser: if deleteuser:
delete_from_sql_table(publickey) delete_from_sql_table(publickey)
if cleardb: if cleandb:
clear_db() clean_db()
if listdatabase: if listdatabase:
list_db() list_db()

View File

@@ -1,3 +1,5 @@
import os
#We can add multiple Tasks here and call them in the do_work function. #We can add multiple Tasks here and call them in the do_work function.
@@ -5,7 +7,7 @@
#the according event type in the beginning of dvm.py and #the according event type in the beginning of dvm.py and
def GoogleTranslate(text, translation_lang): def google_translate(text, translation_lang):
from translatepy.translators.google import GoogleTranslate from translatepy.translators.google import GoogleTranslate
gtranslate = GoogleTranslate() gtranslate = GoogleTranslate()
length = len(text) length = len(text)
@@ -36,3 +38,21 @@ def GoogleTranslate(text, translation_lang):
return translated_text return translated_text
def extract_text_from_pdf(url):
    """Download the PDF at *url* and return the text of all its pages.

    The document is parsed from memory, so no temporary file is written to
    the working directory; nothing is left behind if the download or the
    parsing fails, and parallel jobs cannot overwrite each other's file.

    Args:
        url: HTTP(S) address of the PDF document.

    Returns:
        The extracted text of every page, concatenated in page order.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an error status.
    """
    from io import BytesIO

    from pypdf import PdfReader
    import requests

    # Without a timeout an unresponsive host would block the worker forever.
    response = requests.get(url, timeout=30)
    # Fail early on 4xx/5xx instead of feeding an error page to the PDF parser.
    response.raise_for_status()

    reader = PdfReader(BytesIO(response.content))
    # Join once instead of repeated string concatenation; `or ""` guards
    # against a page for which no text is returned.
    return "".join(page.extract_text() or "" for page in reader.pages)

View File

@@ -19,6 +19,30 @@ def get_task(event, client, dvmconfig):
else: else:
return "unknown job: " + event.as_json() return "unknown job: " + event.as_json()
elif event.kind() == EventDefinitions.KIND_NIP90_EXTRACT_TEXT:
for tag in event.tags():
if tag.as_vec()[0] == "i":
if tag.as_vec()[2] == "url":
file_type = check_url_is_readable(tag.as_vec()[1])
if file_type == "pdf":
return "pdf-to-text"
else:
return "unknown job"
elif tag.as_vec()[2] == "event":
evt = get_event_by_id(tag.as_vec()[1],config=dvmconfig)
if evt is not None:
if evt.kind() == 1063:
for tag in evt.tags():
if tag.as_vec()[0] == 'url':
file_type = check_url_is_readable(tag.as_vec()[1])
if file_type == "pdf":
return "pdf-to-text"
else:
return "unknown job"
else:
return "unknown type"
elif event.kind() == EventDefinitions.KIND_NIP90_TRANSLATE_TEXT: elif event.kind() == EventDefinitions.KIND_NIP90_TRANSLATE_TEXT:
return "translation" return "translation"
@@ -58,6 +82,8 @@ def check_task_is_supported(event, client, get_duration = False, config=None):
print("No output set") print("No output set")
if task not in dvmconfig.SUPPORTED_TASKS: # The Tasks this DVM supports (can be extended) if task not in dvmconfig.SUPPORTED_TASKS: # The Tasks this DVM supports (can be extended)
return False, task, duration return False, task, duration
elif task == "translation" and ( elif task == "translation" and (
input_type != "event" and input_type != "job" and input_type != "text"): # The input types per task input_type != "event" and input_type != "job" and input_type != "text"): # The input types per task
return False, task, duration return False, task, duration
@@ -88,12 +114,15 @@ def check_url_is_readable(url):
".mp3") or content_type == 'audio/ogg' or str(url).endswith(".ogg"): ".mp3") or content_type == 'audio/ogg' or str(url).endswith(".ogg"):
return "audio" return "audio"
elif content_type == 'image/png' or str(url).endswith(".png") or content_type == 'image/jpg' or str(url).endswith( elif content_type == 'image/png' or str(url).endswith(".png") or content_type == 'image/jpg' or str(url).endswith(
".jpg") or content_type == 'image/jpeg' or str(url).endswith(".jpeg") or str(url).endswith(".pdf") or content_type == 'image/png' or str( ".jpg") or content_type == 'image/jpeg' or str(url).endswith(".jpeg") or content_type == 'image/png' or str(
url).endswith(".png"): url).endswith(".png"):
return "image" return "image"
elif content_type == 'video/mp4' or str(url).endswith(".mp4") or content_type == 'video/avi' or str(url).endswith( elif content_type == 'video/mp4' or str(url).endswith(".mp4") or content_type == 'video/avi' or str(url).endswith(
".avi") or content_type == 'video/mov' or str(url).endswith(".mov"): ".avi") or content_type == 'video/mov' or str(url).endswith(".mov"):
return "video" return "video"
elif (str(url)).endswith(".pdf"):
return "pdf"
# Otherwise we will not offer to do the job. # Otherwise we will not offer to do the job.
return None return None
@@ -101,6 +130,9 @@ def get_amount_per_task(task, duration = 0, config=None):
dvmconfig = config dvmconfig = config
if task == "translation": if task == "translation":
amount = dvmconfig.COSTPERUNIT_TRANSLATION amount = dvmconfig.COSTPERUNIT_TRANSLATION
elif task == "pdf-to-text":
amount = dvmconfig.COSTPERUNIT_TEXT_EXTRACTION
else: else:
print("[Nostr] Task " + task + " is currently not supported by this instance, skipping") print("[Nostr] Task " + task + " is currently not supported by this instance, skipping")
return None return None

View File

@@ -101,7 +101,7 @@ def delete_from_sql_table(npub):
print(e) print(e)
def clear_db(): def clean_db():
try: try:
con = sqlite3.connect(os.getenv(env.USER_DB_PATH)) con = sqlite3.connect(os.getenv(env.USER_DB_PATH))
cur = con.cursor() cur = con.cursor()

View File

@@ -3,7 +3,7 @@ from dataclasses import dataclass
from nostr_sdk import Event from nostr_sdk import Event
NEW_USER_BALANCE = 250 NEW_USER_BALANCE = 250
LOCAL_TASKS = ["conversion", "summarization","note-recommendation", "inactive-following", "image-upscale", "translation"] LOCAL_TASKS = ["pdf-to-text", "translation"]
# Tasks performed by the DVM and not send to nova-server (can change later) # Tasks performed by the DVM and not send to nova-server (can change later)
RELAY_LIST = ["wss://relay.damus.io", "wss://nostr-pub.wellorder.net", "wss://nos.lol", "wss://nostr.wine", RELAY_LIST = ["wss://relay.damus.io", "wss://nostr-pub.wellorder.net", "wss://nos.lol", "wss://nostr.wine",
@@ -54,7 +54,8 @@ class DVMConfig:
SHOWRESULTBEFOREPAYMENT: bool = True # if this is true show results even when not paid right after autoprocess SHOWRESULTBEFOREPAYMENT: bool = True # if this is true show results even when not paid right after autoprocess
NEW_USER_BALANCE: int = 250 # Free credits for new users NEW_USER_BALANCE: int = 250 # Free credits for new users
COSTPERUNIT_TRANSLATION: int = 20 # Still need to multiply this by duration COSTPERUNIT_TRANSLATION: int = 20
COSTPERUNIT_TEXT_EXTRACTION: int = 20
NIP89s: list = [] NIP89s: list = []

View File

@@ -7,5 +7,6 @@ LNBITS_INVOICE_KEY = "LNBITS_INVOICE_KEY"
LNBITS_HOST = "LNBITS_HOST" LNBITS_HOST = "LNBITS_HOST"
TASK_TRANSLATION_NIP89_DTAG = "TASK_TRANSLATION_NIP89_DTAG" TASK_TRANSLATION_NIP89_DTAG = "TASK_TRANSLATION_NIP89_DTAG"
TASK_TEXTEXTRACTION_NIP89_DTAG = "TASK_TEXTEXTRACTION_NIP89_DTAG"

View File

@@ -62,4 +62,29 @@ def create_requestform_from_nostr_event(event, is_bot=False, client=None, dvmcon
text.replace('\U0001f919', "").replace("=", "equals"). text.replace('\U0001f919', "").replace("=", "equals").
replace(";", ",")) replace(";", ","))
elif task == "pdf-to-text":
input_type = "url"
input_content = ""
url = ""
for tag in event.tags():
if tag.as_vec()[0] == 'i':
input_type = tag.as_vec()[2]
input_content = tag.as_vec()[1]
if input_type == "url":
url = input_content
elif input_type == "event":
evt = get_event_by_id(input_content, config=dvmconfig)
url = re.search("(?P<url>https?://[^\s]+)", evt.content()).group("url")
elif input_type == "job":
evt = get_referenced_event_by_id(input_content, [EventDefinitions.KIND_NIP90_RESULT_GENERATE_IMAGE],
client, config=dvmconfig)
url = re.search("(?P<url>https?://[^\s]+)", evt.content()).group("url")
request_form["optStr"] = 'url=' + url
return request_form return request_form