在github看到有个解析p站视频链接视频地址的python脚本,使用requests-html、Pyppeteer获取动态渲染JS的内容,解析出mp4_url。
其原本的解析函数 def 如下:
def get_video(v_url):
    """Resolve the direct mp4 URL for a single video page.

    Fetches the page, extracts the inlined ``flashvars_*`` player script,
    renders it through requests-html/Pyppeteer, then evaluates the obfuscated
    ``media_*`` JS assignments with execjs to obtain the media-definitions
    endpoint, and finally queries that endpoint for a non-empty ``videoUrl``.

    Relies on the module-level ``cookies``, ``headers`` and ``proxy`` dicts.
    """
    session = HTMLSession()
    r = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=10)
    bs = BeautifulSoup(r.text, 'html.parser')
    script = bs.find('div', class_='original mainPlayerDiv').find('script').string
    script = script.strip()
    # The player variable is suffixed with the video id, e.g. flashvars_12345.
    # Raw string: '\d'-style escapes in a non-raw string are deprecated.
    var_name = re.findall(r'var flashvars_(.*) =', script)[0]
    js = f"""
        () => {{
            var playerObjList = {{}}
            {script}
            var num = flashvars_{var_name}['mediaDefinitions'].length - 1
            while (flashvars_{var_name}['mediaDefinitions'][num]['format'] != "mp4")
            {{
                num -= 1
            }}
            return flashvars_{var_name}['mediaDefinitions'][num]['videoUrl']
        }}
    """
    r.html.render(script=js, timeout=100, retries=5)
    time.sleep(2)
    s = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=5).text
    js_data = re.findall(r'= media_\d;(var .*?media_\d.*?;)', s)
    urls = []
    for i, j in enumerate(js_data):
        # Wrap each obfuscated assignment chain in a function so execjs can
        # evaluate it and hand back the final media_<n> value.
        js = 'function test(a){ ' + j + 'return media_' + str(i + 1) + ';}'
        ss = execjs.compile(js)
        urls.append(ss.call('test', '1'))
    nul = urls[-1]
    video_url = ''
    count = 0
    # Probe the media-definitions JSON from the last entry backwards until a
    # non-empty videoUrl turns up; give up after the last three entries.
    while video_url == '':
        video_url = session.get(nul, cookies=cookies, headers=headers,
                                proxies=proxy, timeout=5).json()[count - 1]['videoUrl']
        count -= 1
        if count < -3:
            break
    return video_url
在此脚本基础上改动了一下,解析博主所有页面中的所有视频链接,然后调用该def解析每个页面的视频url。完整脚本如下:
# -*- coding: utf-8 -*-
import os
import re
import time
import execjs
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from subprocess import call
# --- Shared HTTP request configuration --------------------------------------

# Desktop Chrome user agent so the site serves the regular desktop pages.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36",
}

# Cookies that pre-acknowledge the age gate so pages render directly.
cookies = {
    'accessAgeDisclaimerPH': '1',
    'hasVisited': '1',
    'd_fs': '1',
    '_gat': '1',
}

# Route all traffic through a local SOCKS5 proxy ("socks5h" = remote DNS).
proxy = {
    'http': 'socks5h://127.0.0.1:1080',
    'https': 'socks5h://127.0.0.1:1080',
}
def IDMdown(DownUrl, DownPath, FileName):
    """Queue a download in Internet Download Manager, then start its queue.

    DownUrl  -- direct URL of the file to fetch
    DownPath -- directory IDM should save into
    FileName -- file name to save as
    """
    # Invoke IDMan.exe by absolute path instead of the original os.chdir():
    # chdir mutates process-wide state and would silently break any later
    # relative-path operation elsewhere in the script.
    idm = os.path.join(
        "C:\\Users\\Administrator\\Documents\\Inet Download Manager 6.38.2",
        "IDMan.exe",
    )
    # /d url /p path /f name /a = add task to the queue; /s = start the queue.
    call([idm, '/d', DownUrl, '/p', DownPath, '/f', FileName, '/a'])
    call([idm, '/s'])
def get_video(v_url):
    """Resolve the direct mp4 URL for a single video page.

    Fetches the page, extracts the inlined ``flashvars_*`` player script,
    renders it through requests-html/Pyppeteer, then evaluates the obfuscated
    ``media_*`` JS assignments with execjs to obtain the media-definitions
    endpoint, and finally queries that endpoint for a non-empty ``videoUrl``.

    Relies on the module-level ``cookies``, ``headers`` and ``proxy`` dicts.
    """
    session = HTMLSession()
    r = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=10)
    bs = BeautifulSoup(r.text, 'html.parser')
    script = bs.find('div', class_='original mainPlayerDiv').find('script').string
    script = script.strip()
    # The player variable is suffixed with the video id, e.g. flashvars_12345.
    # Raw string: '\d'-style escapes in a non-raw string are deprecated.
    var_name = re.findall(r'var flashvars_(.*) =', script)[0]
    js = f"""
        () => {{
            var playerObjList = {{}}
            {script}
            var num = flashvars_{var_name}['mediaDefinitions'].length - 1
            while (flashvars_{var_name}['mediaDefinitions'][num]['format'] != "mp4")
            {{
                num -= 1
            }}
            return flashvars_{var_name}['mediaDefinitions'][num]['videoUrl']
        }}
    """
    r.html.render(script=js, timeout=100, retries=5)
    time.sleep(2)
    s = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=5).text
    js_data = re.findall(r'= media_\d;(var .*?media_\d.*?;)', s)
    urls = []
    for i, j in enumerate(js_data):
        # Wrap each obfuscated assignment chain in a function so execjs can
        # evaluate it and hand back the final media_<n> value.
        js = 'function test(a){ ' + j + 'return media_' + str(i + 1) + ';}'
        ss = execjs.compile(js)
        urls.append(ss.call('test', '1'))
    nul = urls[-1]
    video_url = ''
    count = 0
    # Probe the media-definitions JSON from the last entry backwards until a
    # non-empty videoUrl turns up; give up after the last three entries.
    while video_url == '':
        video_url = session.get(nul, cookies=cookies, headers=headers,
                                proxies=proxy, timeout=5).json()[count - 1]['videoUrl']
        count -= 1
        if count < -3:
            break
    return video_url
def get_pages(username):
    """Enumerate every video of a model and hand each one to IDM.

    Scrapes ``.../model/<username>/videos`` for the total video count, walks
    every listing page (40 videos per page), extracts each video's vkey,
    resolves the direct mp4 URL with ``get_video()`` and queues it via
    ``IDMdown()``. Files already present on disk are skipped.
    """
    user_page = 'https://pornhub.com/model/' + str(username) + '/videos'
    print('[+]Blogger page: ' + user_page)
    html = requests.get(user_page, headers=headers, proxies=proxy, timeout=5).text
    # Total video count from e.g. "Showing 1-32 of 123".
    page_info = re.findall('class="showingInfo" >(.*?) </div>', html)
    video_counts = int(str(page_info[0]).split('of')[1].strip())
    print('[+]Video_counts: ' + str(video_counts))
    # Ceiling division: 40 videos per page. The original
    # `counts // 40 + 1` with `range(1, page_counts)` dropped the final
    # partial page whenever counts % 40 != 0 (e.g. 41 videos -> page 2 lost).
    page_counts = -(-video_counts // 40)
    print('[+]Page_Counts: ' + str(page_counts))
    for x in range(1, page_counts + 1):
        video_page = 'https://pornhub.com/model/' + username + '/videos?page=' + str(x)
        print('[+]Current page: ' + video_page)
        html1 = requests.get(video_page, headers=headers, proxies=proxy, timeout=5).text
        soup = BeautifulSoup(html1, 'html.parser')
        soup1 = str(soup.find_all("div", class_='profileContentLeft'))
        video_keys = re.findall('data-video-vkey="(.*?)" id', soup1)
        for v in video_keys:
            mp4_file = f"{v}.mp4"
            file_path = f"C:\\Users\\Administrator\\Downloads\\{username}"
            if os.path.exists(f"{file_path}\\{mp4_file}"):
                # Already downloaded: skip without sleeping.
                print('[+]文件存在,跳过!')
                continue
            try:
                print(f"[+]当前任务:{v},开始解析!")
                vs = get_video('https://pornhub.com/' + 'view_video.php?viewkey=' + v)
                print(f"[+]Page: {v}已添加下载任务!")
                IDMdown(vs, file_path, mp4_file)
            except Exception as e:
                # Best-effort: log and move on to the next video.
                print(e)
            # Pause between videos so the IDM task queue does not pile up.
            time.sleep(400)
# Model/blogger user name to scrape; edit this before running the script.
username = '博主名'

if __name__ == '__main__':
    get_pages(username)
测试demo,文件保存路径手动创建在"C:\Users\Administrator\Downloads\博主名"
没有评论