On GitHub I came across a Python script that resolves the actual video address from a P-site (Pornhub) video link: it uses requests-html and Pyppeteer to fetch the dynamically JS-rendered content and extract the mp4_url.
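For background, requests-html's render() drives a headless Chromium through Pyppeteer and executes the page's JavaScript before you query the DOM. A minimal sketch of that flow (example.com is just a stand-in URL):

from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://example.com')        # plain HTTP fetch
r.html.render()                               # first call downloads Chromium, then runs the page's JS
print(r.html.find('title', first=True).text) # query the rendered DOM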

The original parsing function:

def get_video(v_url):
    session = HTMLSession()
    r = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=10)
    html = r.text
    bs = BeautifulSoup(html, 'html.parser')
    script = bs.find('div', class_='original mainPlayerDiv').find('script').string
    script = script.strip()
    var_name = re.findall(r'var flashvars_(.*) =', script)[0]

    # Build a JS snippet that re-runs the page's flashvars script and walks
    # mediaDefinitions backwards until the first "mp4" entry is found.
    js = f"""
    () => {{
    var playerObjList = {{}}
    {script}
    var num = flashvars_{var_name}['mediaDefinitions'].length - 1
    while (flashvars_{var_name}['mediaDefinitions'][num]['format'] != "mp4")
    {{
        num -= 1
    }}
    return flashvars_{var_name}['mediaDefinitions'][num]['videoUrl']
    }}
    """

    # render() drives headless Chromium via Pyppeteer; the injected script's
    # return value is discarded here, and the page is re-fetched so the
    # obfuscated media_N assignments can be evaluated with execjs instead.
    r.html.render(script=js, timeout=100, retries=5)
    time.sleep(2)
    s = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=5).text
    js_data = re.findall(r'= media_\d;(var .*?media_\d.*?;)', s)

    urls = []
    for i, j in enumerate(js_data):
        js = 'function test(a){ ' + j + 'return media_' + str(i + 1) + ';}'
        ss = execjs.compile(js)
        x_nul = ss.call('test', '1')
        urls.append(x_nul)

    nul = urls[-1]
    video_url = ''

    # Walk backwards from the end of the returned JSON list (indices -1..-4)
    # until a non-empty videoUrl turns up.
    count = 0
    while video_url == '':
        video_url = session.get(nul, cookies=cookies, headers=headers, proxies=proxy, timeout=5).json()[count - 1][
            'videoUrl']
        count = count - 1
        if count < -3:
            break
    return video_url
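For reference, a minimal call sketch; the viewkey is a placeholder, and cookies/headers/proxy are the globals defined in the full script below:

# <viewkey> below is a placeholder
v_url = 'https://pornhub.com/view_video.php?viewkey=<viewkey>'
print(get_video(v_url))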

Building on this script, I made a few changes: collect every video link from all of a blogger's listing pages, then call the function above to resolve each video's URL. The complete script follows:

# -*- coding: utf-8 -*-

import os
import re
import time
import execjs
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from subprocess import call


headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36"
}
cookies = {
    'accessAgeDisclaimerPH': '1',
    'hasVisited': '1',
    'd_fs': '1',
    '_gat': '1'
}

proxy = {'http': 'socks5h://127.0.0.1:1080',
         'https': 'socks5h://127.0.0.1:1080'}


def IDMdown(DownUrl, DownPath, FileName):
    # Hand the URL to Internet Download Manager's command line:
    # /d URL, /p save path, /f file name, /a add to queue, /s start the queue.
    IDMPath = "C:\\Users\\Administrator\\Documents\\Inet Download Manager 6.38.2"
    os.chdir(IDMPath)
    IDM = "IDMan.exe"
    call([IDM, '/d', DownUrl, '/p', DownPath, '/f', FileName, '/a'])
    call([IDM, '/s'])


def get_video(v_url):
    session = HTMLSession()
    r = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=10)
    html = r.text
    bs = BeautifulSoup(html, 'html.parser')
    script = bs.find('div', class_='original mainPlayerDiv').find('script').string
    script = script.strip()
    var_name = re.findall(r'var flashvars_(.*) =', script)[0]

    js = f"""
    () => {{
    var playerObjList = {{}}
    {script}
    var num = flashvars_{var_name}['mediaDefinitions'].length - 1
    while (flashvars_{var_name}['mediaDefinitions'][num]['format'] != "mp4")
    {{
        num -= 1
    }}
    return flashvars_{var_name}['mediaDefinitions'][num]['videoUrl']
    }}
    """

    r.html.render(script=js, timeout=100, retries=5)
    time.sleep(2)
    s = session.get(v_url, cookies=cookies, headers=headers, proxies=proxy, timeout=5).text
    js_data = re.findall(r'= media_\d;(var .*?media_\d.*?;)', s)

    urls = []
    for i, j in enumerate(js_data):
        js = 'function test(a){ ' + j + 'return media_' + str(i + 1) + ';}'
        ss = execjs.compile(js)
        x_nul = ss.call('test', '1')
        urls.append(x_nul)

    nul = urls[-1]
    video_url = ''

    count = 0
    while video_url == '':
        video_url = session.get(nul, cookies=cookies, headers=headers, proxies=proxy, timeout=5).json()[count - 1][
            'videoUrl']
        count = count - 1
        if count < -3:
            break
    return video_url




def get_pages(username):
    user_page = 'https://pornhub.com/model/' + str(username) + '/videos'
    print('[+]Blogger page:  ' + user_page)
    html = requests.get(user_page, headers=headers, proxies=proxy, timeout=5).text
    # Grab the total video count from the "showing X of Y" element.
    page_info = re.findall('class="showingInfo" >(.*?) </div>', html)
    page_info = str(page_info[0]).split('of')
    video_counts = int(page_info[1].strip())
    print('[+]Video_counts:  ' + str(video_counts))

    # 40 videos per listing page, so round the page count up.
    page_counts = (video_counts + 39) // 40
    print('[+]Page_Counts:  ' + str(page_counts))
    for x in range(1, page_counts + 1):
        video_page = 'https://pornhub.com/model/' + username + '/videos?page=' + str(x)
        print('[+]Current page:  ' + video_page)
        html1 = requests.get(video_page, headers=headers, proxies=proxy, timeout=5).text
        soup = BeautifulSoup(html1, 'html.parser')
        # Pull every video's vkey out of the profile's listing column.
        soup1 = str(soup.find_all("div", class_='profileContentLeft'))
        video_keys = re.findall('data-video-vkey="(.*?)" id', soup1)
        for v in video_keys:
            mp4_file = f"{v}.mp4"
            file_path = f"C:\\Users\\Administrator\\Downloads\\{username}"
            #print(file_path + '\\' + mp4_file)
            if os.path.exists(f"{file_path}\\{mp4_file}"):
                print('[+]File already exists, skipping!')
            else:
                try:
                    print(f"[+]Current task: {v}, start parsing!")
                    vs = get_video('https://pornhub.com/' + 'view_video.php?viewkey=' + v)
                    print(f"[+]Page {v} added to the download queue!")
                    IDMdown(vs, file_path, mp4_file)
                except Exception as e:
                    print(e)
        # Sleep between pages (400 s) so IDM's queue doesn't get flooded.
        time.sleep(400)


username = 'blogger_name'  # placeholder: the model name from the profile URL

if __name__ == '__main__':
    get_pages(username)
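
Side note: tqdm is imported above but the IDM path never uses it. On a machine without IDM, one could swap IDMdown for a plain requests streaming download with a tqdm progress bar. A rough sketch under that assumption (download_with_progress is a name I am introducing; it reuses the headers/proxy globals):

def download_with_progress(url, dir_path, file_name):
    # hypothetical IDMdown stand-in: stream the file and show progress
    resp = requests.get(url, headers=headers, proxies=proxy, stream=True, timeout=30)
    total = int(resp.headers.get('content-length', 0))
    with open(os.path.join(dir_path, file_name), 'wb') as f, \
         tqdm(total=total, unit='B', unit_scale=True, desc=file_name) as bar:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
            bar.update(len(chunk))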

For a test run, the save directory has to be created manually, e.g. "C:\Users\Administrator\Downloads\blogger_name".
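
The directory could also be created from code instead; a one-liner sketch:

import os
save_dir = r'C:\Users\Administrator\Downloads\blogger_name'  # blogger_name is a placeholder
os.makedirs(save_dir, exist_ok=True)  # creates parents as needed; no error if it already exists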

One more note: resolving the video addresses has to go through a proxy; setting your Shadowsocks client (the "little airplane") to global mode is enough.
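
The socks5h:// scheme routes DNS resolution through the proxy as well, and requests needs the SOCKS extra for it (pip install requests[socks]). A quick sketch to confirm the proxy is live, using httpbin.org as an assumed echo endpoint:

import requests

proxy = {'http': 'socks5h://127.0.0.1:1080',
         'https': 'socks5h://127.0.0.1:1080'}
print(requests.get('https://httpbin.org/ip', proxies=proxy, timeout=10).json())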