From 3043c188f5a3c1524b7355e79f776a2154bfae8a Mon Sep 17 00:00:00 2001
From: Believethehype <tobias_baur@yahoo.com>
Date: Mon, 18 Dec 2023 13:57:53 +0100
Subject: [PATCH] fixing media converter tools

---
 nostr_dvm/tasks/convert_media.py           |  2 +-
 nostr_dvm/utils/mediasource_utils.py       |  2 +-
 nostr_dvm/utils/scrapper/media_scrapper.py | 64 ++++++++++++++++++----
 3 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/nostr_dvm/tasks/convert_media.py b/nostr_dvm/tasks/convert_media.py
index 2d30475..fa8655c 100644
--- a/nostr_dvm/tasks/convert_media.py
+++ b/nostr_dvm/tasks/convert_media.py
@@ -4,7 +4,7 @@ from nostr_dvm.interfaces.dvmtaskinterface import DVMTaskInterface
 from nostr_dvm.utils.admin_utils import AdminConfig
 from nostr_dvm.utils.definitions import EventDefinitions
 from nostr_dvm.utils.dvmconfig import DVMConfig, build_default_config
-from nostr_dvm.utils.nip89_utils import NIP89Config, check_and_set_d_tag
+from nostr_dvm.utils.nip89_utils import NIP89Config
 from nostr_dvm.utils.mediasource_utils import organize_input_media_data
 from nostr_dvm.utils.output_utils import upload_media_to_hoster
 
diff --git a/nostr_dvm/utils/mediasource_utils.py b/nostr_dvm/utils/mediasource_utils.py
index 9ae98f7..6c7d40f 100644
--- a/nostr_dvm/utils/mediasource_utils.py
+++ b/nostr_dvm/utils/mediasource_utils.py
@@ -20,7 +20,7 @@ def input_data_file_duration(event, dvm_config, client, start=0, end=0):
             input_type = tag.as_vec()[2]
 
     if input_type == "text":
-        # For now, ingore length of any text, just return 1.
+        # For now, ignore length of any text, just return 1.
         return 1
 
     if input_type == "event":  # NIP94 event
diff --git a/nostr_dvm/utils/scrapper/media_scrapper.py b/nostr_dvm/utils/scrapper/media_scrapper.py
index bff4807..16ba2e1 100644
--- a/nostr_dvm/utils/scrapper/media_scrapper.py
+++ b/nostr_dvm/utils/scrapper/media_scrapper.py
@@ -18,7 +18,15 @@ def XitterDownload(source_url, target_location):
     features, variables = request_details["features"], request_details["variables"]
 
     def get_tokens(tweet_url):
-        html = requests.get(tweet_url)
+        headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0",
+            "Accept": "*/*",
+            "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
+            "Accept-Encoding": "gzip, deflate, br",
+            "TE": "trailers",
+        }
+
+        html = requests.get(tweet_url, headers=headers)
 
         assert (
                 html.status_code == 200
@@ -34,7 +42,6 @@ def XitterDownload(source_url, target_location):
         ), f"Failed to find main.js file.  If you are using the correct Twitter URL this suggests a bug in the script.  Please open a GitHub issue and copy and paste this message.  Tweet url: {tweet_url}"
 
         mainjs_url = mainjs_url[0]
-
         mainjs = requests.get(mainjs_url)
 
         assert (
@@ -80,9 +87,11 @@ def XitterDownload(source_url, target_location):
         variables["tweetId"] = tweet_id
 
         return f"https://twitter.com/i/api/graphql/0hWvDhmW8YQ-S_ib3azIrw/TweetResultByRestId?variables={urllib.parse.quote(json.dumps(variables))}&features={urllib.parse.quote(json.dumps(features))}"
+        # return f"https://api.twitter.com/graphql/ncDeACNGIApPMaqGVuF_rw/TweetResultByRestId?variables={urllib.parse.quote(json.dumps(variables))}&features={urllib.parse.quote(json.dumps(features))}"
 
     def get_tweet_details(tweet_url, guest_token, bearer_token):
         tweet_id = re.findall(r"(?<=status/)\d+", tweet_url)
+
         assert (
                 tweet_id is not None and len(tweet_id) == 1
         ), f"Could not parse tweet id from your url.  Make sure you are using the correct url.  If you are, then file a GitHub issue and copy and paste this message.  Tweet url: {tweet_url}"
@@ -172,7 +181,7 @@ def XitterDownload(source_url, target_location):
         pattern = (
                 r'"expanded_url"\s*:\s*"https://x\.com/[^/]+/status/'
                 + sid
-                + '/[^"]+",\s*"id_str"\s*:\s*"\d+",'
+                + r'/[^"]+",\s*"id_str"\s*:\s*"\d+",'
         )
         matches = re.findall(pattern, j)
         if len(matches) > 0:
@@ -240,17 +249,49 @@ def XitterDownload(source_url, target_location):
             urls = [x["url"] for x in results.values()]
             urls += container_matches
             return urls
-
         return [x["url"] for x in results.values()]
 
+    def extract_mp4_fmp4(j):
+        """
+        Scan the tweet details text `j` for video.twimg.com video URLs.
+        Returns (tweet_id_list, mp4_info_dict_list, tweet_video_url_list): de-duplicated integer ids,
+        dicts with "url" (plus "resolution" for direct MP4 matches), and the tweet_video URLs found.
+        """
+
+        # Accumulators: numeric ids captured from the URL paths, and one info dict per matched URL
+        tweet_id_list = []
+        mp4_info_dict_list = []
+        amplitude_pattern = re.compile(  # NOTE(review): "." is unescaped in these patterns, so it matches any character
+            r"(https://video.twimg.com/amplify_video/(\d+)/vid/(avc1/)(\d+x\d+)/[^.]+.mp4\?tag=\d+)"
+        )
+        ext_tw_pattern = re.compile(  # same group layout as above, but the "avc1/" segment is optional here
+            r"(https://video.twimg.com/ext_tw_video/(\d+)/pu/vid/(avc1/)?(\d+x\d+)/[^.]+.mp4\?tag=\d+)"
+        )
+        tweet_video_pattern = re.compile(r'https://video.twimg.com/tweet_video/[^"]+')
+        container_pattern = re.compile(r'https://video.twimg.com/[^"]*container=fmp4')
+
+        matches = amplitude_pattern.findall(j)  # findall yields one 4-tuple of groups per match
+        matches += ext_tw_pattern.findall(j)
+        container_matches = container_pattern.findall(j)  # no groups, so these are plain URL strings
+        tweet_video_url_list = tweet_video_pattern.findall(j)
+
+        for match in matches:
+            url, tweet_id, _, resolution = match  # third group (the "avc1/" segment) is not used
+            tweet_id_list.append(int(tweet_id))
+            mp4_info_dict_list.append({"resolution": resolution, "url": url})
+
+        tweet_id_list = list(dict.fromkeys(tweet_id_list))  # drop duplicate ids, keeping first-seen order
+
+        if len(container_matches) > 0:
+            for url in container_matches:
+                mp4_info_dict_list.append({"url": url})  # fmp4 container entries carry no "resolution" key
+
+        return tweet_id_list, mp4_info_dict_list, tweet_video_url_list
+
     def download_parts(url, output_filename):
         resp = requests.get(url, stream=True)
-
-        # container begins with / ends with fmp4 and has a resolution in it we want to capture
         pattern = re.compile(r"(/[^\n]*/(\d+x\d+)/[^\n]*container=fmp4)")
-
         matches = pattern.findall(resp.text)
-
         max_res = 0
         max_res_url = None
 
@@ -301,12 +342,9 @@ def XitterDownload(source_url, target_location):
 
     def repost_check(j, exclude_replies=True):
         try:
-            # This line extract the index of the first reply
             reply_index = j.index('"conversationthread-')
         except ValueError:
-            # If there are no replies we use the enrire response data length
             reply_index = len(j)
-        # We truncate the response data to exclude replies
         if exclude_replies:
             j = j[0:reply_index]
 
@@ -360,6 +398,7 @@ def XitterDownload(source_url, target_location):
         bearer_token, guest_token = get_tokens(tweet_url)
         resp = get_tweet_details(tweet_url, guest_token, bearer_token)
         mp4s = extract_mp4s(resp.text, tweet_url, target_all_videos)
+
         if target_all_videos:
             video_counter = 1
             original_urls = repost_check(resp.text, exclude_replies=False)
@@ -377,6 +416,7 @@ def XitterDownload(source_url, target_location):
                             download_parts(mp4, output_file)
 
                         else:
+                            # use a stream to download the file
                             r = requests.get(mp4, stream=True)
                             with open(output_file, "wb") as f:
                                 for chunk in r.iter_content(chunk_size=1024):
@@ -475,7 +515,7 @@ def TiktokDownloadAll(linkList, path) -> str:
     for i in linkList:
         try:
             data['url'] = i
-            result = TikTokDownload(cookies, headers, data, "tiktok", path) # str(linkList.index(i))
+            result = TikTokDownload(cookies, headers, data, "tiktok", path)  # str(linkList.index(i))
             return result
         except IndexError:
             parseDict = getDict()