diff --git a/.env_example b/.env_example index 8f5e51f..ea271f8 100644 --- a/.env_example +++ b/.env_example @@ -5,4 +5,5 @@ USER_DB_PATH = nostrzaps.db LNBITS_INVOICE_KEY = lnbitswalletinvoicekey LNBITS_HOST = https://lnbits.com +TASK_TEXTEXTRACTION_NIP89_DTAG = "asdd" TASK_TRANSLATION_NIP89_DTAG = abcded diff --git a/README.md b/README.md index 2367040..f65f607 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Nostr Data Vending Machine Python Implementation -This example DVM implementation in Python currently supports simple translations using Google translate. +This example DVM implementation in Python currently supports simple translations using Google translate, as well as extraction of text from links with pdf files. At a later stage, additional example tasks will be added, as well as the integration into a larger Machine Learning backend diff --git a/dvm.py b/dvm.py index c6f668b..eb65ecf 100644 --- a/dvm.py +++ b/dvm.py @@ -4,7 +4,7 @@ import time import emoji from utils.definitions import EventDefinitions, DVMConfig, RequiredJobToWatch, JobToWatch, LOCAL_TASKS from utils.admin_utils import admin_make_database_updates -from utils.ai_utils import GoogleTranslate +from utils.ai_utils import extract_text_from_pdf, google_translate from utils.backend_utils import get_amount_per_task, check_task_is_supported, get_task from utils.database_utils import update_sql_table, get_from_sql_table, \ create_sql_table, get_or_add_user, update_user_balance @@ -38,6 +38,7 @@ def dvm(config): dm_zap_filter = Filter().pubkey(pk).kinds([EventDefinitions.KIND_ZAP]).since(Timestamp.now()) dvm_filter = (Filter().kinds([EventDefinitions.KIND_NIP90_GENERIC, + EventDefinitions.KIND_NIP90_EXTRACT_TEXT, EventDefinitions.KIND_NIP90_TRANSLATE_TEXT, ]).since(Timestamp.now())) client.subscribe([dm_zap_filter, dvm_filter]) @@ -60,13 +61,12 @@ def dvm(config): user = get_or_add_user(event.pubkey().to_hex()) is_whitelisted = user[2] is_blacklisted = user[3] - if is_whitelisted: - 
task_supported, task, duration = check_task_is_supported(event, client=client, get_duration=False, - config=dvm_config) - print(task) - else: - task_supported, task, duration = check_task_is_supported(event, client=client, get_duration=True, - config=dvm_config) + + task_supported, task, duration = check_task_is_supported(event, client=client, + get_duration=(not is_whitelisted), + config=dvm_config) + print(task) + if is_blacklisted: send_job_status_reaction(event, "error", client=client, config=dvm_config) print("[Nostr] Request by blacklisted user, skipped") @@ -231,7 +231,9 @@ def dvm(config): options = dict(opts) if task == "translation": - result = GoogleTranslate(options["text"], options["translation_lang"]) + result = google_translate(options["text"], options["translation_lang"]) + elif task == "pdf-to-text": + result = extract_text_from_pdf(options["url"]) # TODO ADD FURTHER LOCAL TASKS HERE check_and_return_event(result, str(job_event.as_json()), diff --git a/main.py b/main.py index 4acb86e..9954a05 100644 --- a/main.py +++ b/main.py @@ -11,21 +11,29 @@ def run_nostr_dvm_with_local_config(): from dvm import dvm, DVMConfig from utils.nip89_utils import NIP89Announcement - dvmconfig = DVMConfig() dvmconfig.PRIVATE_KEY = os.getenv(env.NOSTR_PRIVATE_KEY) - dvmconfig.SUPPORTED_TASKS = ["translation"] + dvmconfig.SUPPORTED_TASKS = ["translation", "pdf-to-text"] dvmconfig.LNBITS_INVOICE_KEY = os.getenv(env.LNBITS_INVOICE_KEY) dvmconfig.LNBITS_URL = os.getenv(env.LNBITS_HOST) + # In admin_utils, set rebroadcast_nip89 to true to (re)broadcast your DVM. You can create a valid dtag and the content on vendata.io + # Add the dtag in your .env file so you can update your dvm later and change the content here as needed. 
+ + nip89extraction = NIP89Announcement() + nip89extraction.kind = EventDefinitions.KIND_NIP90_EXTRACT_TEXT + nip89extraction.dtag = os.getenv(env.TASK_TEXTEXTRACTION_NIP89_DTAG) + nip89extraction.pk = os.getenv(env.NOSTR_PRIVATE_KEY) + nip89extraction.content = "{\"name\":\"Pdf Extractor\",\"image\":\"https://image.nostr.build/c33ca6fc4cc038ca4adb46fdfdfda34951656f87ee364ef59095bae1495ce669.jpg\",\"about\":\"I extract Text from pdf documents\",\"nip90Params\":{}}" + dvmconfig.NIP89s.append(nip89extraction) + nip89translation = NIP89Announcement() nip89translation.kind = EventDefinitions.KIND_NIP90_TRANSLATE_TEXT nip89translation.dtag = os.getenv(env.TASK_TRANSLATION_NIP89_DTAG) nip89translation.pk = os.getenv(env.NOSTR_PRIVATE_KEY) - nip89translation.content = "{\"name\":\"NostrAI DVM Translator\",\"image\":\"https://cdn.nostr.build/i/feb98d8700abe7d6c67d9106a72a20354bf50805af79869638f5a32d24a5ac2a.jpg\",\"about\":\"Translates Text from given text/event/job, currently using Google Translation Services into language defined in param. 
\",\"nip90Params\":{\"language\":{\"required\":true,\"values\":[\"af\",\"am\",\"ar\",\"az\",\"be\",\"bg\",\"bn\",\"bs\",\"ca\",\"ceb\",\"co\",\"cs\",\"cy\",\"da\",\"de\",\"el\",\"eo\",\"es\",\"et\",\"eu\",\"fa\",\"fi\",\"fr\",\"fy\",\"ga\",\"gd\",\"gl\",\"gu\",\"ha\",\"haw\",\"hi\",\"hmn\",\"hr\",\"ht\",\"hu\",\"hy\",\"id\",\"ig\",\"is\",\"it\",\"he\",\"ja\",\"jv\",\"ka\",\"kk\",\"km\",\"kn\",\"ko\",\"ku\",\"ky\",\"la\",\"lb\",\"lo\",\"lt\",\"lv\",\"mg\",\"mi\",\"mk\",\"ml\",\"mn\",\"mr\",\"ms\",\"mt\",\"my\",\"ne\",\"nl\",\"no\",\"ny\",\"or\",\"pa\",\"pl\",\"ps\",\"pt\",\"ro\",\"ru\",\"sd\",\"si\",\"sk\",\"sl\",\"sm\",\"sn\",\"so\",\"sq\",\"sr\",\"st\",\"su\",\"sv\",\"sw\",\"ta\",\"te\",\"tg\",\"th\",\"tl\",\"tr\",\"ug\",\"uk\",\"ur\",\"uz\",\"vi\",\"xh\",\"yi\",\"yo\",\"zh\",\"zu\"]}}}" + nip89translation.content = "{\"name\":\"Translator\",\"image\":\"https://image.nostr.build/c33ca6fc4cc038ca4adb46fdfdfda34951656f87ee364ef59095bae1495ce669.jpg\",\"about\":\"I translate text from given text/event/job, currently using Google Translation Services into language defined in param. 
\",\"nip90Params\":{\"language\":{\"required\":true,\"values\":[\"af\",\"am\",\"ar\",\"az\",\"be\",\"bg\",\"bn\",\"bs\",\"ca\",\"ceb\",\"co\",\"cs\",\"cy\",\"da\",\"de\",\"el\",\"eo\",\"es\",\"et\",\"eu\",\"fa\",\"fi\",\"fr\",\"fy\",\"ga\",\"gd\",\"gl\",\"gu\",\"ha\",\"haw\",\"hi\",\"hmn\",\"hr\",\"ht\",\"hu\",\"hy\",\"id\",\"ig\",\"is\",\"it\",\"he\",\"ja\",\"jv\",\"ka\",\"kk\",\"km\",\"kn\",\"ko\",\"ku\",\"ky\",\"la\",\"lb\",\"lo\",\"lt\",\"lv\",\"mg\",\"mi\",\"mk\",\"ml\",\"mn\",\"mr\",\"ms\",\"mt\",\"my\",\"ne\",\"nl\",\"no\",\"ny\",\"or\",\"pa\",\"pl\",\"ps\",\"pt\",\"ro\",\"ru\",\"sd\",\"si\",\"sk\",\"sl\",\"sm\",\"sn\",\"so\",\"sq\",\"sr\",\"st\",\"su\",\"sv\",\"sw\",\"ta\",\"te\",\"tg\",\"th\",\"tl\",\"tr\",\"ug\",\"uk\",\"ur\",\"uz\",\"vi\",\"xh\",\"yi\",\"yo\",\"zh\",\"zu\"]}}}" dvmconfig.NIP89s.append(nip89translation) - nostr_dvm_thread = Thread(target=dvm, args=[dvmconfig]) nostr_dvm_thread.start() @@ -39,10 +47,4 @@ if __name__ == '__main__': else: raise FileNotFoundError(f'.env file not found at {env_path} ') - - run_nostr_dvm_with_local_config() - - - - diff --git a/requirements.txt b/requirements.txt index 7ac6045..65a607c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,37 @@ translatepy==2.3 tzdata==2023.3 urllib3==2.1.0 wcwidth==0.2.10 +(venv) tobias@Tobiass-MacBook-Pro-2 dvm % pip freeze +beautifulsoup4==4.12.2 +bech32==1.2.0 +blessed==1.20.0 +certifi==2023.7.22 +charset-normalizer==3.3.2 +emoji==2.8.0 +ffmpegio==0.8.5 +ffmpegio-core==0.8.5 +idna==3.4 +inquirer==3.1.3 +install==1.3.5 +nostr-sdk==0.0.4 +numpy==1.26.2 +packaging==23.2 +pandas==2.1.3 +Pillow==10.1.0 +pluggy==1.3.0 +pycryptodome==3.19.0 +pypdf==3.17.1 +python-dateutil==2.8.2 +python-dotenv==1.0.0 +python-editor==1.0.4 +pytz==2023.3.post1 +pyuseragents==1.0.5 +readchar==4.0.5 +requests==2.31.0 +safeIO==1.2 +six==1.16.0 +soupsieve==2.5 +translatepy==2.3 +tzdata==2023.3 +urllib3==2.1.0 +wcwidth==0.2.10 diff --git a/utils/admin_utils.py b/utils/admin_utils.py index 
9aae0b8..dde6265 100644 --- a/utils/admin_utils.py +++ b/utils/admin_utils.py @@ -3,8 +3,8 @@ import time from nostr_sdk import Keys, EventBuilder, PublicKey -from utils.database_utils import get_from_sql_table, list_db, clear_db, delete_from_sql_table, update_sql_table, \ - get_or_add_user, update_user_metadata +from utils.database_utils import get_from_sql_table, list_db, delete_from_sql_table, update_sql_table, \ + get_or_add_user, clean_db from utils.nip89_utils import nip89_announce_tasks from utils.nostr_utils import send_event @@ -14,7 +14,7 @@ def admin_make_database_updates(config=None, client=None): dvmconfig = config rebroadcast_nip89 = False - cleardb = False + cleandb = False listdatabase = False deleteuser = False whitelistuser = False @@ -57,8 +57,8 @@ def admin_make_database_updates(config=None, client=None): if deleteuser: delete_from_sql_table(publickey) - if cleardb: - clear_db() + if cleandb: + clean_db() if listdatabase: list_db() diff --git a/utils/ai_utils.py b/utils/ai_utils.py index cedcb7c..9fef2ed 100644 --- a/utils/ai_utils.py +++ b/utils/ai_utils.py @@ -1,3 +1,5 @@ +import os + #We can add multiple Tasks here and call them in the do_work function. 
@@ -5,7 +7,7 @@ #the according event type in the beginning of dvm.py and -def GoogleTranslate(text, translation_lang): +def google_translate(text, translation_lang): from translatepy.translators.google import GoogleTranslate gtranslate = GoogleTranslate() length = len(text) @@ -36,3 +38,14 @@ def GoogleTranslate(text, translation_lang): return translated_text + +def extract_text_from_pdf(url): + """Download the PDF at url and return the text of all its pages, concatenated in page order. The document is parsed from memory, so no shared temp file is written; HTTP errors and timeouts raise instead of being parsed as a PDF.""" + import io + from pypdf import PdfReader + import requests + response = requests.get(url, timeout=30) + response.raise_for_status() + reader = PdfReader(io.BytesIO(response.content)) + return "".join(page.extract_text() for page in reader.pages) + diff --git a/utils/backend_utils.py b/utils/backend_utils.py index ec5efa2..eb49df7 100644 --- a/utils/backend_utils.py +++ b/utils/backend_utils.py @@ -19,6 +19,30 @@ def get_task(event, client, dvmconfig): else: return "unknown job: " + event.as_json() + elif event.kind() == EventDefinitions.KIND_NIP90_EXTRACT_TEXT: + for tag in event.tags(): + if tag.as_vec()[0] == "i": + if tag.as_vec()[2] == "url": + file_type = check_url_is_readable(tag.as_vec()[1]) + if file_type == "pdf": + return "pdf-to-text" + else: + return "unknown job" + elif tag.as_vec()[2] == "event": + evt = get_event_by_id(tag.as_vec()[1],config=dvmconfig) + if evt is not None: + if evt.kind() == 1063: + for tag in evt.tags(): + if tag.as_vec()[0] == 'url': + file_type = check_url_is_readable(tag.as_vec()[1]) + if file_type == "pdf": + return "pdf-to-text" + else: + return "unknown job" + else: + return "unknown type" + + elif event.kind() == EventDefinitions.KIND_NIP90_TRANSLATE_TEXT: return "translation" @@ -58,6 +82,8 @@ def check_task_is_supported(event, client, get_duration = False, config=None): print("No output set") if task not in dvmconfig.SUPPORTED_TASKS: # The Tasks this DVM supports (can be 
extended) return False, task, duration + + elif task == "translation" and ( input_type != "event" and input_type != "job" and input_type != "text"): # The input types per task return False, task, duration @@ -88,12 +114,15 @@ def check_url_is_readable(url): ".mp3") or content_type == 'audio/ogg' or str(url).endswith(".ogg"): return "audio" elif content_type == 'image/png' or str(url).endswith(".png") or content_type == 'image/jpg' or str(url).endswith( - ".jpg") or content_type == 'image/jpeg' or str(url).endswith(".jpeg") or str(url).endswith(".pdf") or content_type == 'image/png' or str( + ".jpg") or content_type == 'image/jpeg' or str(url).endswith(".jpeg") or content_type == 'image/png' or str( url).endswith(".png"): return "image" elif content_type == 'video/mp4' or str(url).endswith(".mp4") or content_type == 'video/avi' or str(url).endswith( ".avi") or content_type == 'video/mov' or str(url).endswith(".mov"): return "video" + elif (str(url)).endswith(".pdf"): + return "pdf" + # Otherwise we will not offer to do the job. 
return None @@ -101,6 +130,9 @@ def get_amount_per_task(task, duration = 0, config=None): dvmconfig = config if task == "translation": amount = dvmconfig.COSTPERUNIT_TRANSLATION + elif task == "pdf-to-text": + amount = dvmconfig.COSTPERUNIT_TEXT_EXTRACTION + else: print("[Nostr] Task " + task + " is currently not supported by this instance, skipping") return None diff --git a/utils/database_utils.py b/utils/database_utils.py index 03951e5..d55ae02 100644 --- a/utils/database_utils.py +++ b/utils/database_utils.py @@ -101,7 +101,7 @@ def delete_from_sql_table(npub): print(e) -def clear_db(): +def clean_db(): try: con = sqlite3.connect(os.getenv(env.USER_DB_PATH)) cur = con.cursor() diff --git a/utils/definitions.py b/utils/definitions.py index 6857ae2..1dac005 100644 --- a/utils/definitions.py +++ b/utils/definitions.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from nostr_sdk import Event NEW_USER_BALANCE = 250 -LOCAL_TASKS = ["conversion", "summarization","note-recommendation", "inactive-following", "image-upscale", "translation"] +LOCAL_TASKS = ["pdf-to-text", "translation"] # Tasks performed by the DVM and not send to nova-server (can change later) RELAY_LIST = ["wss://relay.damus.io", "wss://nostr-pub.wellorder.net", "wss://nos.lol", "wss://nostr.wine", @@ -54,7 +54,8 @@ class DVMConfig: SHOWRESULTBEFOREPAYMENT: bool = True # if this is true show results even when not paid right after autoprocess NEW_USER_BALANCE: int = 250 # Free credits for new users - COSTPERUNIT_TRANSLATION: int = 20 # Still need to multiply this by duration + COSTPERUNIT_TRANSLATION: int = 20 + COSTPERUNIT_TEXT_EXTRACTION: int = 20 NIP89s: list = [] diff --git a/utils/env.py b/utils/env.py index 7df959c..1fef1d1 100644 --- a/utils/env.py +++ b/utils/env.py @@ -7,5 +7,6 @@ LNBITS_INVOICE_KEY = "LNBITS_INVOICE_KEY" LNBITS_HOST = "LNBITS_HOST" TASK_TRANSLATION_NIP89_DTAG = "TASK_TRANSLATION_NIP89_DTAG" +TASK_TEXTEXTRACTION_NIP89_DTAG = "TASK_TEXTEXTRACTION_NIP89_DTAG" diff --git 
a/utils/requestform_utils.py b/utils/requestform_utils.py index 28d4c9f..02df531 100644 --- a/utils/requestform_utils.py +++ b/utils/requestform_utils.py @@ -62,4 +62,29 @@ def create_requestform_from_nostr_event(event, is_bot=False, client=None, dvmcon text.replace('\U0001f919', "").replace("=", "equals"). replace(";", ",")) + + + + elif task == "pdf-to-text": + input_type = "url" + input_content = "" + url = "" + for tag in event.tags(): + if tag.as_vec()[0] == 'i': + input_type = tag.as_vec()[2] + input_content = tag.as_vec()[1] + + if input_type == "url": + url = input_content + elif input_type == "event": + evt = get_event_by_id(input_content, config=dvmconfig) + url = re.search(r"(?P<url>https?://[^\s]+)", evt.content()).group("url") + elif input_type == "job": + evt = get_referenced_event_by_id(input_content, [EventDefinitions.KIND_NIP90_RESULT_GENERATE_IMAGE], + client, config=dvmconfig) + + url = re.search(r"(?P<url>https?://[^\s]+)", evt.content()).group("url") + + request_form["optStr"] = 'url=' + url + return request_form