nostrdvm/nostr_dvm/utils/scrapper/media_scrapper.py
2023-12-18 13:57:53 +01:00

640 lines
26 KiB
Python

import json
import os
import re
import sys
import urllib.parse
from typing import Any
from urllib.request import urlopen, Request
import requests
import instaloader
from pytube import YouTube
def XitterDownload(source_url, target_location):
script_dir = os.path.dirname(os.path.realpath(__file__))
request_details_file = f"{script_dir}{os.sep}request_details.json"
request_details = json.load(open(request_details_file, "r")) # test
features, variables = request_details["features"], request_details["variables"]
def get_tokens(tweet_url):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0",
"Accept": "*/*",
"Accept-Language": "de,en-US;q=0.7,en;q=0.3",
"Accept-Encoding": "gzip, deflate, br",
"TE": "trailers",
}
html = requests.get(tweet_url, headers=headers)
assert (
html.status_code == 200
), f"Failed to get tweet page. If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Status code: {html.status_code}. Tweet url: {tweet_url}"
mainjs_url = re.findall(
r"https://abs.twimg.com/responsive-web/client-web-legacy/main.[^\.]+.js",
html.text,
)
assert (
mainjs_url is not None and len(mainjs_url) > 0
), f"Failed to find main.js file. If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Tweet url: {tweet_url}"
mainjs_url = mainjs_url[0]
mainjs = requests.get(mainjs_url)
assert (
mainjs.status_code == 200
), f"Failed to get main.js file. If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Status code: {mainjs.status_code}. Tweet url: {tweet_url}"
bearer_token = re.findall(r'AAAAAAAAA[^"]+', mainjs.text)
assert (
bearer_token is not None and len(bearer_token) > 0
), f"Failed to find bearer token. If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Tweet url: {tweet_url}, main.js url: {mainjs_url}"
bearer_token = bearer_token[0]
# get the guest token
with requests.Session() as s:
s.headers.update(
{
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0",
"accept": "*/*",
"accept-language": "de,en-US;q=0.7,en;q=0.3",
"accept-encoding": "gzip, deflate, br",
"te": "trailers",
}
)
s.headers.update({"authorization": f"Bearer {bearer_token}"})
# activate bearer token and get guest token
guest_token = s.post("https://api.twitter.com/1.1/guest/activate.json").json()[
"guest_token"
]
assert (
guest_token is not None
), f"Failed to find guest token. If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Tweet url: {tweet_url}, main.js url: {mainjs_url}"
return bearer_token, guest_token
def get_details_url(tweet_id, features, variables):
# create a copy of variables - we don't want to modify the original
variables = {**variables}
variables["tweetId"] = tweet_id
return f"https://twitter.com/i/api/graphql/0hWvDhmW8YQ-S_ib3azIrw/TweetResultByRestId?variables={urllib.parse.quote(json.dumps(variables))}&features={urllib.parse.quote(json.dumps(features))}"
# return f"https://api.twitter.com/graphql/ncDeACNGIApPMaqGVuF_rw/TweetResultByRestId?variables={urllib.parse.quote(json.dumps(variables))}&features={urllib.parse.quote(json.dumps(features))}"
def get_tweet_details(tweet_url, guest_token, bearer_token):
tweet_id = re.findall(r"(?<=status/)\d+", tweet_url)
assert (
tweet_id is not None and len(tweet_id) == 1
), f"Could not parse tweet id from your url. Make sure you are using the correct url. If you are, then file a GitHub issue and copy and paste this message. Tweet url: {tweet_url}"
tweet_id = tweet_id[0]
# the url needs a url encoded version of variables and features as a query string
url = get_details_url(tweet_id, features, variables)
details = requests.get(
url,
headers={
"authorization": f"Bearer {bearer_token}",
"x-guest-token": guest_token,
},
)
max_retries = 10
cur_retry = 0
while details.status_code == 400 and cur_retry < max_retries:
try:
error_json = json.loads(details.text)
except json.JSONDecodeError:
assert (
False
), f"Failed to parse json from details error. details text: {details.text} If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Status code: {details.status_code}. Tweet url: {tweet_url}"
assert (
"errors" in error_json
), f"Failed to find errors in details error json. If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Status code: {details.status_code}. Tweet url: {tweet_url}"
needed_variable_pattern = re.compile(r"Variable '([^']+)'")
needed_features_pattern = re.compile(
r'The following features cannot be null: ([^"]+)'
)
for error in error_json["errors"]:
needed_vars = needed_variable_pattern.findall(error["message"])
for needed_var in needed_vars:
variables[needed_var] = True
needed_features = needed_features_pattern.findall(error["message"])
for nf in needed_features:
for feature in nf.split(","):
features[feature.strip()] = True
url = get_details_url(tweet_id, features, variables)
details = requests.get(
url,
headers={
"authorization": f"Bearer {bearer_token}",
"x-guest-token": guest_token,
},
)
cur_retry += 1
if details.status_code == 200:
# save new variables
request_details["variables"] = variables
request_details["features"] = features
with open(request_details_file, "w") as f:
json.dump(request_details, f, indent=4)
assert (
details.status_code == 200
), f"Failed to get tweet details. If you are using the correct Twitter URL this suggests a bug in the script. Please open a GitHub issue and copy and paste this message. Status code: {details.status_code}. Tweet url: {tweet_url}"
return details
def get_tweet_status_id(tweet_url):
sid_patern = r"https://x\.com/[^/]+/status/(\d+)"
if tweet_url[len(tweet_url) - 1] != "/":
tweet_url = tweet_url + "/"
match = re.findall(sid_patern, tweet_url)
if len(match) == 0:
print("error, could not get status id from this tweet url :", tweet_url)
exit()
status_id = match[0]
return status_id
def get_associated_media_id(j, tweet_url):
sid = get_tweet_status_id(tweet_url)
pattern = (
r'"expanded_url"\s*:\s*"https://x\.com/[^/]+/status/'
+ sid
+ r'/[^"]+",\s*"id_str"\s*:\s*"\d+",'
)
matches = re.findall(pattern, j)
if len(matches) > 0:
target = matches[0]
target = target[0: len(target) - 1] # remove the coma at the end
return json.loads("{" + target + "}")["id_str"]
return None
def extract_mp4s(j, tweet_url, target_all_mp4s=False):
# pattern looks like https://video.twimg.com/amplify_video/1638969830442237953/vid/1080x1920/lXSFa54mAVp7KHim.mp4?tag=16 or https://video.twimg.com/ext_tw_video/1451958820348080133/pu/vid/720x1280/GddnMJ7KszCQQFvA.mp4?tag=12
amplitude_pattern = re.compile(
r"(https://video.twimg.com/amplify_video/(\d+)/vid/(\d+x\d+)/[^.]+.mp4\?tag=\d+)"
)
ext_tw_pattern = re.compile(
r"(https://video.twimg.com/ext_tw_video/(\d+)/pu/vid/(avc1/)?(\d+x\d+)/[^.]+.mp4\?tag=\d+)"
)
# format - https://video.twimg.com/tweet_video/Fvh6brqWAAQhU9p.mp4
tweet_video_pattern = re.compile(r'https://video.twimg.com/tweet_video/[^"]+')
# https://video.twimg.com/ext_tw_video/1451958820348080133/pu/pl/b-CiC-gZClIwXgDz.m3u8?tag=12&container=fmp4
container_pattern = re.compile(r'https://video.twimg.com/[^"]*container=fmp4')
media_id = get_associated_media_id(j, tweet_url)
# find all the matches
matches = amplitude_pattern.findall(j)
matches += ext_tw_pattern.findall(j)
container_matches = container_pattern.findall(j)
tweet_video_matches = tweet_video_pattern.findall(j)
if len(matches) == 0 and len(tweet_video_matches) > 0:
return tweet_video_matches
results = {}
for match in matches:
url, tweet_id, _, resolution = match
if tweet_id not in results:
results[tweet_id] = {"resolution": resolution, "url": url}
else:
# if we already have a higher resolution video, then don't overwrite it
my_dims = [int(x) for x in resolution.split("x")]
their_dims = [int(x) for x in results[tweet_id]["resolution"].split("x")]
if my_dims[0] * my_dims[1] > their_dims[0] * their_dims[1]:
results[tweet_id] = {"resolution": resolution, "url": url}
if media_id:
all_urls = []
for twid in results:
all_urls.append(results[twid]["url"])
all_urls += container_matches
url_with_media_id = []
for url in all_urls:
if url.__contains__(media_id):
url_with_media_id.append(url)
if len(url_with_media_id) > 0:
return url_with_media_id
if len(container_matches) > 0 and not target_all_mp4s:
return container_matches
if target_all_mp4s:
urls = [x["url"] for x in results.values()]
urls += container_matches
return urls
return [x["url"] for x in results.values()]
def extract_mp4_fmp4(j):
"""
Extract the URL of the MP4 video from the detailed information of the tweet.
Returns a list of URLs, tweet IDs, and resolution information (dictionary type)
and a list of tweet IDs as return values.
"""
# Empty list to store tweet IDs
tweet_id_list = []
mp4_info_dict_list = []
amplitude_pattern = re.compile(
r"(https://video.twimg.com/amplify_video/(\d+)/vid/(avc1/)(\d+x\d+)/[^.]+.mp4\?tag=\d+)"
)
ext_tw_pattern = re.compile(
r"(https://video.twimg.com/ext_tw_video/(\d+)/pu/vid/(avc1/)?(\d+x\d+)/[^.]+.mp4\?tag=\d+)"
)
tweet_video_pattern = re.compile(r'https://video.twimg.com/tweet_video/[^"]+')
container_pattern = re.compile(r'https://video.twimg.com/[^"]*container=fmp4')
matches = amplitude_pattern.findall(j)
matches += ext_tw_pattern.findall(j)
container_matches = container_pattern.findall(j)
tweet_video_url_list = tweet_video_pattern.findall(j)
for match in matches:
url, tweet_id, _, resolution = match
tweet_id_list.append(int(tweet_id))
mp4_info_dict_list.append({"resolution": resolution, "url": url})
tweet_id_list = list(dict.fromkeys(tweet_id_list))
if len(container_matches) > 0:
for url in container_matches:
mp4_info_dict_list.append({"url": url})
return tweet_id_list, mp4_info_dict_list, tweet_video_url_list
def download_parts(url, output_filename):
resp = requests.get(url, stream=True)
pattern = re.compile(r"(/[^\n]*/(\d+x\d+)/[^\n]*container=fmp4)")
matches = pattern.findall(resp.text)
max_res = 0
max_res_url = None
for match in matches:
url, resolution = match
width, height = resolution.split("x")
res = int(width) * int(height)
if res > max_res:
max_res = res
max_res_url = url
assert (
max_res_url is not None
), f"Could not find a url to download from. Make sure you are using the correct url. If you are, then file a GitHub issue and copy and paste this message. Tweet url: {url}"
video_part_prefix = "https://video.twimg.com"
resp = requests.get(video_part_prefix + max_res_url, stream=True)
mp4_pattern = re.compile(r"(/[^\n]*\.mp4)")
mp4_parts = mp4_pattern.findall(resp.text)
assert (
len(mp4_parts) == 1
), f"There should be exactly 1 mp4 container at this point. Instead, found {len(mp4_parts)}. Please open a GitHub issue and copy and paste this message into it. Tweet url: {url}"
mp4_url = video_part_prefix + mp4_parts[0]
m4s_part_pattern = re.compile(r"(/[^\n]*\.m4s)")
m4s_parts = m4s_part_pattern.findall(resp.text)
with open(output_filename, "wb") as f:
r = requests.get(mp4_url, stream=True)
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
for part in m4s_parts:
part_url = video_part_prefix + part
r = requests.get(part_url, stream=True)
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
return True
def repost_check(j, exclude_replies=True):
try:
reply_index = j.index('"conversationthread-')
except ValueError:
reply_index = len(j)
if exclude_replies:
j = j[0:reply_index]
# We use this regular expression to extract the source status
source_status_pattern = r'"source_status_id_str"\s*:\s*"\d+"'
matches = re.findall(source_status_pattern, j)
if len(matches) > 0 and exclude_replies:
# We extract the source status id (ssid)
ssid = json.loads("{" + matches[0] + "}")["source_status_id_str"]
# We plug it in this regular expression to find expanded_url (the original tweet url)
expanded_url_pattern = (
r'"expanded_url"\s*:\s*"https://x\.com/[^/]+/status/' + ssid + '[^"]+"'
)
matches2 = re.findall(expanded_url_pattern, j)
if len(matches2) > 0:
# We extract the url and return it
status_url = json.loads("{" + matches2[0] + "}")["expanded_url"]
return status_url
if not exclude_replies:
# If we include replies we'll have to get all ssids and remove duplicates
ssids = []
for match in matches:
ssids.append(json.loads("{" + match + "}")["source_status_id_str"])
# we remove duplicates (this line is messy but it's the easiest way to do it)
ssids = list(set(ssids))
if len(ssids) > 0:
for ssid in ssids:
expanded_url_pattern = (
r'"expanded_url"\s*:\s*"https://x\.com/[^/]+/status/'
+ ssid
+ '[^"]+"'
)
matches2 = re.findall(expanded_url_pattern, j)
if len(matches2) > 0:
status_urls = []
for match in matches2:
status_urls.append(
json.loads("{" + match + "}")["expanded_url"]
)
# We remove duplicates another time
status_urls = list(set(status_urls))
return status_urls
# If we don't find source_status_id_str, the tweet doesn't feature a reposted video
return None
def download_video_from_x(tweet_url, output_file, target_all_videos=False):
bearer_token, guest_token = get_tokens(tweet_url)
resp = get_tweet_details(tweet_url, guest_token, bearer_token)
mp4s = extract_mp4s(resp.text, tweet_url, target_all_videos)
if target_all_videos:
video_counter = 1
original_urls = repost_check(resp.text, exclude_replies=False)
if len(original_urls) > 0:
for url in original_urls:
download_video_from_x(
url, output_file.replace(".mp4", f"_{video_counter}.mp4")
)
video_counter += 1
if len(mp4s) > 0:
for mp4 in mp4s:
output_file = output_file.replace(".mp4", f"_{video_counter}.mp4")
if "container" in mp4:
download_parts(mp4, output_file)
else:
# use a stream to download the file
r = requests.get(mp4, stream=True)
with open(output_file, "wb") as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
video_counter += 1
else:
original_url = repost_check(resp.text)
if original_url:
download_video_from_x(original_url, output_file)
else:
assert (
len(mp4s) > 0
), f"Could not find any mp4s to download. Make sure you are using the correct url. If you are, then file a GitHub issue and copy and paste this message. Tweet url: {tweet_url}"
mp4 = mp4s[0]
if "container" in mp4:
download_parts(mp4, output_file)
else:
# use a stream to download the file
r = requests.get(mp4, stream=True)
with open(output_file, "wb") as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk:
f.write(chunk)
f.flush()
return target_location
return download_video_from_x(source_url, target_location)
# TIKTOK/INSTA
def getDict() -> dict:
response = requests.get('https://ttdownloader.com/')
point = response.text.find('<input type="hidden" id="token" name="token" value="') + \
len('<input type="hidden" id="token" name="token" value="')
token = response.text[point:point + 64]
TTDict = {
'token': token,
}
for i in response.cookies:
TTDict[str(i).split()[1].split('=')[0].strip()] = str(
i).split()[1].split('=')[1].strip()
return TTDict
def createHeader(parseDict) -> tuple[dict[str, Any], dict[str | Any, str | Any], dict[str, str | Any]]:
cookies = {
'PHPSESSID': parseDict['PHPSESSID'],
# 'popCookie': parseDict['popCookie'],
}
headers = {
'authority': 'ttdownloader.com',
'accept': '*/*',
'accept-language': 'en-US,en;q=0.9',
'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
'origin': 'https://ttdownloader.com',
'referer': 'https://ttdownloader.com/',
'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/108.0.0.0 Safari/537.36',
'x-requested-with': 'XMLHttpRequest',
}
data = {
'url': '',
'format': '',
'token': parseDict['token'],
}
return cookies, headers, data
def TikTokDownload(cookies, headers, data, name, path) -> str:
response = requests.post('https://ttdownloader.com/search/',
cookies=cookies, headers=headers, data=data)
parsed_link = [i for i in str(response.text).split()
if i.startswith("href=")][0]
response = requests.get(parsed_link[6:-10])
with open(path + "\\" + name + ".mp4", "wb") as f:
f.write(response.content)
return path + "\\" + name + ".mp4"
def TiktokDownloadAll(linkList, path) -> str:
parseDict = getDict()
cookies, headers, data = createHeader(parseDict)
# linkList = getLinkDict()['tiktok']
for i in linkList:
try:
data['url'] = i
result = TikTokDownload(cookies, headers, data, "tiktok", path) # str(linkList.index(i))
return result
except IndexError:
parseDict = getDict()
cookies, headers, data = createHeader(parseDict)
except Exception as err:
print(err)
exit(1)
def InstagramDownload(url, name, path) -> str:
obj = instaloader.Instaloader()
post = instaloader.Post.from_shortcode(obj.context, url.split("/")[-2])
photo_url = post.url
video_url = post.video_url
print(video_url)
if video_url:
response = requests.get(video_url)
with open(path + "\\" + name + ".mp4", "wb") as f:
f.write(response.content)
return path + "\\" + name + ".mp4"
elif photo_url:
response = requests.get(photo_url)
with open(path + "\\" + name + ".jpg", "wb") as f:
f.write(response.content)
return path + "\\" + name + ".jpg"
def InstagramDownloadAll(linklist, path) -> str:
for i in linklist:
try:
print(str(linklist.index(i)))
print(str(linklist[i]))
result = InstagramDownload(i, str(linklist.index(i)), path)
return result
except Exception as err:
print(err)
exit(1)
# YOUTUBE
def YouTubeDownload(link, path, audio_only=True):
youtubeObject = YouTube(link)
if audio_only:
youtubeObject = youtubeObject.streams.get_audio_only()
youtubeObject.download(path, "yt.mp3")
print("Download is completed successfully")
return path + "yt.mp3"
else:
youtubeObject = youtubeObject.streams.get_highest_resolution()
youtubeObject.download(path, "yt.mp4")
print("Download is completed successfully")
return path + "yt.mp4"
def checkYoutubeLinkValid(link):
try:
# TODO find a way to test without fully downloading the file
youtubeObject = YouTube(link)
youtubeObject = youtubeObject.streams.get_audio_only()
youtubeObject.download(".", "yt.mp3")
os.remove("yt.mp3")
return True
except Exception as e:
print(str(e))
return False
# OVERCAST
def OvercastDownload(source_url, target_location):
def get_title(html_str):
"""Get the title from the meta tags"""
title = re.findall(r"<meta name=\"og:title\" content=\"(.+)\"", html_str)
if len(title) == 1:
return title[0].replace("&mdash;", "-")
return None
def get_description(html_str):
"""Get the description from the Meta tag"""
desc_re = r"<meta name=\"og:description\" content=\"(.+)\""
description = re.findall(desc_re, html_str)
if len(description) == 1:
return description[0]
return None
def get_url(html_string):
"""Find the URL from the <audio><source>.... tag"""
url = re.findall(r"<source src=\"(.+?)\"", html_string)
if len(url) == 1:
# strip off the last 4 characters to cater for the #t=0 in the URL
# which urlretrieve flags as invalid
return url[0][:-4]
return None
"""Given a Overcast source URL fetch the file it points to"""
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
"AppleWebKit/537.11 (KHTML, like Gecko) "
"Chrome/23.0.1271.64 Safari/537.11",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
"Accept-Encoding": "none",
"Accept-Language": "en-US,en;q=0.8",
"Connection": "keep-alive",
}
req = Request(source_url, None, headers)
source_data = urlopen(req).read().decode('utf-8')
title = get_title(source_data)
url = get_url(source_data)
if url is None or title is None:
sys.exit("Could not find parse URL")
if not os.path.exists(target_location):
req = requests.get(url)
file = open(target_location, 'wb')
for chunk in req.iter_content(100000):
file.write(chunk)
file.close()