# TVMao programme-guide crawler (电视猫节目单爬虫)

#!/usr/bin/env python3
# encoding: utf-8

import base64
import datetime
from urllib.parse import quote
 
import requests
from bs4 import BeautifulSoup
 
# Shared Session so cookies persist between the page fetch and the AJAX call.
# requests.Session() is the documented class; session() is just a thin alias.
session = requests.Session()
 
 
def crawl(url, file_name, daynum):
    """Fetch one day's TV schedule from tvmao.com and append the programme
    names to *file_name*.

    :param url: full schedule-page URL, e.g. https://www.tvmao.com/program/XXX-w1.html
    :param file_name: output text file path (opened in append mode)
    :param daynum: weekday digit as a string ("1".."7"); written as a '周N' header
    """
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7",
        "Host": "www.tvmao.com",
        "Pragma": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
    }
    # The first half of the schedule is served unencrypted in the page itself.
    html = session.get(url, headers=headers).text

    # The second half comes from the AJAX endpoint, which requires the
    # obfuscated "p" parameter reconstructed from the page (see get_param_p).
    p = get_param_p(html)

    headers["Referer"] = url
    headers["X-Requested-With"] = "XMLHttpRequest"
    api_url = "https://www.tvmao.com/api/pg?p=" + quote(p)
    response = session.get(api_url, headers=headers).json()

    with open(file_name, "a", encoding="UTF-8") as fwt:
        fwt.write('周' + daynum + '\n')
        # BUG FIX: the original parsed "./response-01.html" / "./response-02.html"
        # from disk, but the debug code that wrote those files was commented out,
        # so stale (or missing) files were parsed.  Parse the freshly fetched
        # content directly instead.
        soup = BeautifulSoup(html, 'lxml')
        for link in soup.select('#pgrow > li > div > span.p_show > a'):
            print(link.text)
            fwt.write(link.text + '\n')
        # response[1] holds the HTML fragment for the second half of the day.
        soup = BeautifulSoup(response[1], 'lxml')
        for link in soup.select('body > li > div > span.p_show > a'):
            print(link.text)
            fwt.write(link.text + '\n')
 
 
def get_param_p(html):
    """Reconstruct the obfuscated "p" query parameter for the /api/pg endpoint.

    The value is one character of the base64 alphabet selected from today's
    weekday, followed by two base64-encoded fragments built from hidden
    attributes of the <form> embedded in the schedule page.
    """
    doc = BeautifulSoup(html, features="html.parser")
    form = doc.select_one("form")

    # Alphabet index derived from the weekday (Mon=0): shift down by one,
    # map 0 -> 7, then square.  This mirrors the site's JS obfuscation.
    idx = datetime.datetime.now().weekday() - 1
    if idx == 0:
        idx = 7
    idx *= idx
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
    prefix = alphabet[idx]

    button_id = form.select_one("button[type=submit]")["id"]
    part_a = b64_s(button_id + "|" + form["a"])
    part_q = b64_s("|" + form["q"])

    return prefix + part_a + part_q
 
 
def b64_s(s):
    """Base64-encode *s* (as UTF-8 bytes) and return the result as a str."""
    encoded = base64.b64encode(s.encode("UTF-8"))
    return encoded.decode("UTF-8")
 
 
if __name__ == "__main__":
#     name = '珠江频道,广东体育频道,广东公共频道,珠江频道海外版,广东新闻频道,嘉佳卡通频道,广东国际频道'.split(',')
#     url = 'GDTV-GDTV2,GDTV-GDTV3,GDTV-GDTV4,GDTV-GDTV5,GDTV-GDTV6,GDTV-GDTV7,GDTV-GDTV8'.split(',')
     name='江苏综艺频道,江苏城市频道,江苏影视频道,江苏靓妆频道,江苏休闲体育频道,江苏优漫卡通频道,江苏公共·新闻,江苏教育电视台,江苏国际频道,江苏学习频道'.split(',')
     url = 'JSTV-JSTV2,JSTV-JSTV3,JSTV-JSTV4,JSTV-JSTV5,JSTV-JSTV6,JSTV-JSTV7,JSTV-JSTV8,JSTV-JSTV9,JSTV-JSTV10,JSTV-JETVZK'.split(',')
 
    for j in range(len(name)):
        for i in range(1,8):
            file_name = r'./epg/province/'+name[j]+'.txt'
            daynum = str(i)
    #         crawl("https://www.tvmao.com/program/CCTV-CCTV1-w"+str(i)+".html", file_name, daynum)
            crawl("https://www.tvmao.com/program/"+str(url[j])+'-w'+str(i)+".html", file_name, daynum)
# (Removed: CSDN blog-page boilerplate — comment/payment-widget text — that was
#  accidentally captured when this script was copied from the web page.)