add second task, pdf-to-text

example for kind 5000 event
This commit is contained in:
Believethehype 2023-11-19 16:59:56 +01:00
parent c450c8544f
commit 535d5c2dd6
12 changed files with 148 additions and 30 deletions

View File

@ -5,4 +5,5 @@ USER_DB_PATH = nostrzaps.db
LNBITS_INVOICE_KEY = lnbitswalletinvoicekey
LNBITS_HOST = https://lnbits.com
TASK_TEXTEXTRACTION_NIP89_DTAG = "asdd"
TASK_TRANSLATION_NIP89_DTAG = abcded

View File

@ -1,6 +1,6 @@
# Nostr Data Vending Machine Python Implementation
This example DVM implementation in Python currently supports simple translations using Google translate.
This example DVM implementation in Python currently supports simple translations using Google translate, as well as extraction of text from links with pdf files.
At a later stage, additional example tasks will be added, as well as integration into a larger machine learning backend.

20
dvm.py
View File

@ -4,7 +4,7 @@ import time
import emoji
from utils.definitions import EventDefinitions, DVMConfig, RequiredJobToWatch, JobToWatch, LOCAL_TASKS
from utils.admin_utils import admin_make_database_updates
from utils.ai_utils import GoogleTranslate
from utils.ai_utils import extract_text_from_pdf, google_translate
from utils.backend_utils import get_amount_per_task, check_task_is_supported, get_task
from utils.database_utils import update_sql_table, get_from_sql_table, \
create_sql_table, get_or_add_user, update_user_balance
@ -38,6 +38,7 @@ def dvm(config):
dm_zap_filter = Filter().pubkey(pk).kinds([EventDefinitions.KIND_ZAP]).since(Timestamp.now())
dvm_filter = (Filter().kinds([EventDefinitions.KIND_NIP90_GENERIC,
EventDefinitions.KIND_NIP90_EXTRACT_TEXT,
EventDefinitions.KIND_NIP90_TRANSLATE_TEXT,
]).since(Timestamp.now()))
client.subscribe([dm_zap_filter, dvm_filter])
@ -60,13 +61,12 @@ def dvm(config):
user = get_or_add_user(event.pubkey().to_hex())
is_whitelisted = user[2]
is_blacklisted = user[3]
if is_whitelisted:
task_supported, task, duration = check_task_is_supported(event, client=client, get_duration=False,
config=dvm_config)
print(task)
else:
task_supported, task, duration = check_task_is_supported(event, client=client, get_duration=True,
config=dvm_config)
task_supported, task, duration = check_task_is_supported(event, client=client,
get_duration=(not is_whitelisted),
config=dvm_config)
print(task)
if is_blacklisted:
send_job_status_reaction(event, "error", client=client, config=dvm_config)
print("[Nostr] Request by blacklisted user, skipped")
@ -231,7 +231,9 @@ def dvm(config):
options = dict(opts)
if task == "translation":
result = GoogleTranslate(options["text"], options["translation_lang"])
result = google_translate(options["text"], options["translation_lang"])
elif task == "pdf-to-text":
result = extract_text_from_pdf(options["url"])
# TODO ADD FURTHER LOCAL TASKS HERE
check_and_return_event(result, str(job_event.as_json()),

22
main.py
View File

@ -11,21 +11,29 @@ def run_nostr_dvm_with_local_config():
from dvm import dvm, DVMConfig
from utils.nip89_utils import NIP89Announcement
dvmconfig = DVMConfig()
dvmconfig.PRIVATE_KEY = os.getenv(env.NOSTR_PRIVATE_KEY)
dvmconfig.SUPPORTED_TASKS = ["translation"]
dvmconfig.SUPPORTED_TASKS = ["translation", "pdf-to-text"]
dvmconfig.LNBITS_INVOICE_KEY = os.getenv(env.LNBITS_INVOICE_KEY)
dvmconfig.LNBITS_URL = os.getenv(env.LNBITS_HOST)
# In admin_utils, set rebroadcast_nip89 to true to (re)broadcast your DVM. You can create a valid dtag and the content on vendata.io
# Add the dtag in your .env file so you can update your dvm later and change the content here as needed.
# NIP-89 announcement for the pdf-to-text task.
# It must use its own d-tag (TASK_TEXTEXTRACTION_NIP89_DTAG); reusing the
# translation d-tag would give both announcements the same identifier.
nip89extraction = NIP89Announcement()
nip89extraction.kind = EventDefinitions.KIND_NIP90_EXTRACT_TEXT
nip89extraction.dtag = os.getenv(env.TASK_TEXTEXTRACTION_NIP89_DTAG)
nip89extraction.pk = os.getenv(env.NOSTR_PRIVATE_KEY)
nip89extraction.content = "{\"name\":\"Pdf Extractor\",\"image\":\"https://image.nostr.build/c33ca6fc4cc038ca4adb46fdfdfda34951656f87ee364ef59095bae1495ce669.jpg\",\"about\":\"I extract Text from pdf documents\",\"nip90Params\":{}}"
dvmconfig.NIP89s.append(nip89extraction)
nip89translation = NIP89Announcement()
nip89translation.kind = EventDefinitions.KIND_NIP90_TRANSLATE_TEXT
nip89translation.dtag = os.getenv(env.TASK_TRANSLATION_NIP89_DTAG)
nip89translation.pk = os.getenv(env.NOSTR_PRIVATE_KEY)
nip89translation.content = "{\"name\":\"NostrAI DVM Translator\",\"image\":\"https://cdn.nostr.build/i/feb98d8700abe7d6c67d9106a72a20354bf50805af79869638f5a32d24a5ac2a.jpg\",\"about\":\"Translates Text from given text/event/job, currently using Google Translation Services into language defined in param. \",\"nip90Params\":{\"language\":{\"required\":true,\"values\":[\"af\",\"am\",\"ar\",\"az\",\"be\",\"bg\",\"bn\",\"bs\",\"ca\",\"ceb\",\"co\",\"cs\",\"cy\",\"da\",\"de\",\"el\",\"eo\",\"es\",\"et\",\"eu\",\"fa\",\"fi\",\"fr\",\"fy\",\"ga\",\"gd\",\"gl\",\"gu\",\"ha\",\"haw\",\"hi\",\"hmn\",\"hr\",\"ht\",\"hu\",\"hy\",\"id\",\"ig\",\"is\",\"it\",\"he\",\"ja\",\"jv\",\"ka\",\"kk\",\"km\",\"kn\",\"ko\",\"ku\",\"ky\",\"la\",\"lb\",\"lo\",\"lt\",\"lv\",\"mg\",\"mi\",\"mk\",\"ml\",\"mn\",\"mr\",\"ms\",\"mt\",\"my\",\"ne\",\"nl\",\"no\",\"ny\",\"or\",\"pa\",\"pl\",\"ps\",\"pt\",\"ro\",\"ru\",\"sd\",\"si\",\"sk\",\"sl\",\"sm\",\"sn\",\"so\",\"sq\",\"sr\",\"st\",\"su\",\"sv\",\"sw\",\"ta\",\"te\",\"tg\",\"th\",\"tl\",\"tr\",\"ug\",\"uk\",\"ur\",\"uz\",\"vi\",\"xh\",\"yi\",\"yo\",\"zh\",\"zu\"]}}}"
nip89translation.content = "{\"name\":\"Translator\",\"image\":\"https://image.nostr.build/c33ca6fc4cc038ca4adb46fdfdfda34951656f87ee364ef59095bae1495ce669.jpg\",\"about\":\"I translate text from given text/event/job, currently using Google Translation Services into language defined in param. \",\"nip90Params\":{\"language\":{\"required\":true,\"values\":[\"af\",\"am\",\"ar\",\"az\",\"be\",\"bg\",\"bn\",\"bs\",\"ca\",\"ceb\",\"co\",\"cs\",\"cy\",\"da\",\"de\",\"el\",\"eo\",\"es\",\"et\",\"eu\",\"fa\",\"fi\",\"fr\",\"fy\",\"ga\",\"gd\",\"gl\",\"gu\",\"ha\",\"haw\",\"hi\",\"hmn\",\"hr\",\"ht\",\"hu\",\"hy\",\"id\",\"ig\",\"is\",\"it\",\"he\",\"ja\",\"jv\",\"ka\",\"kk\",\"km\",\"kn\",\"ko\",\"ku\",\"ky\",\"la\",\"lb\",\"lo\",\"lt\",\"lv\",\"mg\",\"mi\",\"mk\",\"ml\",\"mn\",\"mr\",\"ms\",\"mt\",\"my\",\"ne\",\"nl\",\"no\",\"ny\",\"or\",\"pa\",\"pl\",\"ps\",\"pt\",\"ro\",\"ru\",\"sd\",\"si\",\"sk\",\"sl\",\"sm\",\"sn\",\"so\",\"sq\",\"sr\",\"st\",\"su\",\"sv\",\"sw\",\"ta\",\"te\",\"tg\",\"th\",\"tl\",\"tr\",\"ug\",\"uk\",\"ur\",\"uz\",\"vi\",\"xh\",\"yi\",\"yo\",\"zh\",\"zu\"]}}}"
dvmconfig.NIP89s.append(nip89translation)
nostr_dvm_thread = Thread(target=dvm, args=[dvmconfig])
nostr_dvm_thread.start()
@ -39,10 +47,4 @@ if __name__ == '__main__':
else:
raise FileNotFoundError(f'.env file not found at {env_path} ')
run_nostr_dvm_with_local_config()

View File

@ -30,3 +30,37 @@ translatepy==2.3
tzdata==2023.3
urllib3==2.1.0
wcwidth==0.2.10
# Output of `pip freeze` from the project virtualenv (shell prompt removed so pip can parse this file):
beautifulsoup4==4.12.2
bech32==1.2.0
blessed==1.20.0
certifi==2023.7.22
charset-normalizer==3.3.2
emoji==2.8.0
ffmpegio==0.8.5
ffmpegio-core==0.8.5
idna==3.4
inquirer==3.1.3
install==1.3.5
nostr-sdk==0.0.4
numpy==1.26.2
packaging==23.2
pandas==2.1.3
Pillow==10.1.0
pluggy==1.3.0
pycryptodome==3.19.0
pypdf==3.17.1
python-dateutil==2.8.2
python-dotenv==1.0.0
python-editor==1.0.4
pytz==2023.3.post1
pyuseragents==1.0.5
readchar==4.0.5
requests==2.31.0
safeIO==1.2
six==1.16.0
soupsieve==2.5
translatepy==2.3
tzdata==2023.3
urllib3==2.1.0
wcwidth==0.2.10

View File

@ -3,8 +3,8 @@ import time
from nostr_sdk import Keys, EventBuilder, PublicKey
from utils.database_utils import get_from_sql_table, list_db, clear_db, delete_from_sql_table, update_sql_table, \
get_or_add_user, update_user_metadata
from utils.database_utils import get_from_sql_table, list_db, delete_from_sql_table, update_sql_table, \
get_or_add_user, clean_db
from utils.nip89_utils import nip89_announce_tasks
from utils.nostr_utils import send_event
@ -14,7 +14,7 @@ def admin_make_database_updates(config=None, client=None):
dvmconfig = config
rebroadcast_nip89 = False
cleardb = False
cleandb = False
listdatabase = False
deleteuser = False
whitelistuser = False
@ -57,8 +57,8 @@ def admin_make_database_updates(config=None, client=None):
if deleteuser:
delete_from_sql_table(publickey)
if cleardb:
clear_db()
if cleandb:
clean_db()
if listdatabase:
list_db()

View File

@ -1,3 +1,5 @@
import os
#We can add multiple Tasks here and call them in the do_work function.
@ -5,7 +7,7 @@
#the according event type in the beginning of dvm.py and
def GoogleTranslate(text, translation_lang):
def google_translate(text, translation_lang):
from translatepy.translators.google import GoogleTranslate
gtranslate = GoogleTranslate()
length = len(text)
@ -36,3 +38,21 @@ def GoogleTranslate(text, translation_lang):
return translated_text
def extract_text_from_pdf(url):
    """Download the PDF at *url* and return the text of all its pages.

    The document is parsed from an in-memory buffer instead of a fixed
    ``temp.pdf`` in the working directory, so concurrent jobs cannot
    overwrite each other's download and no file is left behind when
    parsing fails.

    Args:
        url: HTTP(S) location of the PDF document.

    Returns:
        The text of every page, concatenated in page order.

    Raises:
        requests.RequestException: if the download fails, times out, or the
            server answers with an error status.
    """
    from io import BytesIO

    import requests
    from pypdf import PdfReader

    # A bounded timeout keeps an unresponsive host from blocking the job forever.
    response = requests.get(url, timeout=30)
    # Fail on the HTTP error itself rather than trying to parse an error page as a PDF.
    response.raise_for_status()

    reader = PdfReader(BytesIO(response.content))
    # `or ""` guards against a page yielding no text object at all.
    return "".join(page.extract_text() or "" for page in reader.pages)

View File

@ -19,6 +19,30 @@ def get_task(event, client, dvmconfig):
else:
return "unknown job: " + event.as_json()
elif event.kind() == EventDefinitions.KIND_NIP90_EXTRACT_TEXT:
for tag in event.tags():
if tag.as_vec()[0] == "i":
if tag.as_vec()[2] == "url":
file_type = check_url_is_readable(tag.as_vec()[1])
if file_type == "pdf":
return "pdf-to-text"
else:
return "unknown job"
elif tag.as_vec()[2] == "event":
evt = get_event_by_id(tag.as_vec()[1],config=dvmconfig)
if evt is not None:
if evt.kind() == 1063:
for tag in evt.tags():
if tag.as_vec()[0] == 'url':
file_type = check_url_is_readable(tag.as_vec()[1])
if file_type == "pdf":
return "pdf-to-text"
else:
return "unknown job"
else:
return "unknown type"
elif event.kind() == EventDefinitions.KIND_NIP90_TRANSLATE_TEXT:
return "translation"
@ -58,6 +82,8 @@ def check_task_is_supported(event, client, get_duration = False, config=None):
print("No output set")
if task not in dvmconfig.SUPPORTED_TASKS: # The Tasks this DVM supports (can be extended)
return False, task, duration
elif task == "translation" and (
input_type != "event" and input_type != "job" and input_type != "text"): # The input types per task
return False, task, duration
@ -88,12 +114,15 @@ def check_url_is_readable(url):
".mp3") or content_type == 'audio/ogg' or str(url).endswith(".ogg"):
return "audio"
elif content_type == 'image/png' or str(url).endswith(".png") or content_type == 'image/jpg' or str(url).endswith(
".jpg") or content_type == 'image/jpeg' or str(url).endswith(".jpeg") or str(url).endswith(".pdf") or content_type == 'image/png' or str(
".jpg") or content_type == 'image/jpeg' or str(url).endswith(".jpeg") or content_type == 'image/png' or str(
url).endswith(".png"):
return "image"
elif content_type == 'video/mp4' or str(url).endswith(".mp4") or content_type == 'video/avi' or str(url).endswith(
".avi") or content_type == 'video/mov' or str(url).endswith(".mov"):
return "video"
elif (str(url)).endswith(".pdf"):
return "pdf"
# Otherwise we will not offer to do the job.
return None
@ -101,6 +130,9 @@ def get_amount_per_task(task, duration = 0, config=None):
dvmconfig = config
if task == "translation":
amount = dvmconfig.COSTPERUNIT_TRANSLATION
elif task == "pdf-to-text":
amount = dvmconfig.COSTPERUNIT_TEXT_EXTRACTION
else:
print("[Nostr] Task " + task + " is currently not supported by this instance, skipping")
return None

View File

@ -101,7 +101,7 @@ def delete_from_sql_table(npub):
print(e)
def clear_db():
def clean_db():
try:
con = sqlite3.connect(os.getenv(env.USER_DB_PATH))
cur = con.cursor()

View File

@ -3,7 +3,7 @@ from dataclasses import dataclass
from nostr_sdk import Event
NEW_USER_BALANCE = 250
LOCAL_TASKS = ["conversion", "summarization","note-recommendation", "inactive-following", "image-upscale", "translation"]
LOCAL_TASKS = ["pdf-to-text", "translation"]
# Tasks performed by the DVM and not sent to nova-server (can change later)
RELAY_LIST = ["wss://relay.damus.io", "wss://nostr-pub.wellorder.net", "wss://nos.lol", "wss://nostr.wine",
@ -54,7 +54,8 @@ class DVMConfig:
SHOWRESULTBEFOREPAYMENT: bool = True # if this is true show results even when not paid right after autoprocess
NEW_USER_BALANCE: int = 250 # Free credits for new users
COSTPERUNIT_TRANSLATION: int = 20 # Still need to multiply this by duration
COSTPERUNIT_TRANSLATION: int = 20
COSTPERUNIT_TEXT_EXTRACTION: int = 20
NIP89s: list = []

View File

@ -7,5 +7,6 @@ LNBITS_INVOICE_KEY = "LNBITS_INVOICE_KEY"
LNBITS_HOST = "LNBITS_HOST"
TASK_TRANSLATION_NIP89_DTAG = "TASK_TRANSLATION_NIP89_DTAG"
TASK_TEXTEXTRACTION_NIP89_DTAG = "TASK_TEXTEXTRACTION_NIP89_DTAG"

View File

@ -62,4 +62,29 @@ def create_requestform_from_nostr_event(event, is_bot=False, client=None, dvmcon
text.replace('\U0001f919', "").replace("=", "equals").
replace(";", ","))
elif task == "pdf-to-text":
input_type = "url"
input_content = ""
url = ""
for tag in event.tags():
if tag.as_vec()[0] == 'i':
input_type = tag.as_vec()[2]
input_content = tag.as_vec()[1]
if input_type == "url":
url = input_content
elif input_type == "event":
evt = get_event_by_id(input_content, config=dvmconfig)
url = re.search("(?P<url>https?://[^\s]+)", evt.content()).group("url")
elif input_type == "job":
evt = get_referenced_event_by_id(input_content, [EventDefinitions.KIND_NIP90_RESULT_GENERATE_IMAGE],
client, config=dvmconfig)
url = re.search("(?P<url>https?://[^\s]+)", evt.content()).group("url")
request_form["optStr"] = 'url=' + url
return request_form