#!/usr/bin/env python3
# encoding: utf-8
import base64
import datetime
import os
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

session = requests.Session()
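
# crawl(): fetch one weekday's schedule page, derive the obfuscated "p" token from
# the page's <form>, call the /api/pg endpoint for the lower half of the listing,
# and append every programme title to file_name.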
def crawl(url, file_name, daynum):
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9,ja;q=0.8,en;q=0.7",
        "Host": "www.tvmao.com",
        "Pragma": "no-cache",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
    }
    # The upper half of the schedule is not encrypted; it is parsed straight from
    # this page HTML further down, so no extra decoding is needed here.
    html = session.get(url, headers=headers).text
    # for debugging: dump the raw page
    # with open("./response-01.html", "w", encoding="UTF-8") as f:
    #     f.write(html)
    p = get_param_p(html)
    # print(f"computed p = {p}")
    headers["Referer"] = url
    headers["X-Requested-With"] = "XMLHttpRequest"
    url = "https://www.tvmao.com/api/pg?p=" + quote(p)
    response = session.get(url, headers=headers).json()
    # for debugging: dump the HTML fragment returned by the API
    # with open("./response-02.html", "w", encoding="UTF-8") as f:
    #     f.write(response[1])
    with open(file_name, "a", encoding="UTF-8") as fwt:
        fwt.write('周' + daynum + '\n')  # weekday header, e.g. 周1 = Monday
        # Upper half: programme titles embedded in the page HTML itself
        soup = BeautifulSoup(html, 'lxml')
        for i in soup.select('#pgrow > li > div > span.p_show > a'):
            print(i.text)
            fwt.write(i.text + '\n')
        # Lower half: the API's JSON payload carries an HTML fragment in response[1]
        soup = BeautifulSoup(response[1], 'lxml')
        for i in soup.select('body > li > div > span.p_show > a'):
            print(i.text)
            fwt.write(i.text + '\n')
def get_param_p(html):
    """Rebuild the obfuscated "p" token that the site's JavaScript sends to /api/pg."""
    doc = BeautifulSoup(html, features="html.parser")
    form = doc.select_one("form")
    # The first character of p is picked out of the base64 alphabet by a
    # weekday-derived index, mirroring the site's obfuscated JS.
    d = datetime.datetime.now()
    week = d.weekday() - 1
    if week == 0:
        week = 7
    week = week * week
    f = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="[week]
    # The remainder is base64 of the submit button's id joined with two form attributes.
    x = form.select_one("button[type=submit]")["id"]
    t1 = b64_s(x + "|" + form["a"])
    v = b64_s("|" + form["q"])
    return f + t1 + v
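
# Shape of p with hypothetical form values: if the submit button id were "abc",
# form["a"] == "1" and form["q"] == "2", then p would be the weekday character
# followed by "YWJjfDE=" and "fDI=".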
def b64_s(s):
    """
    The obfuscated arithmetic is dizzying to trace; to avoid muddying the waters,
    the unimportant parts have been shortened as much as possible.
    :param s: string to encode
    :return: base64 of the UTF-8 bytes of s, decoded back to str
    """
    return base64.b64encode(s.encode("UTF-8")).decode("UTF-8")
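
# e.g. b64_s("abc") == "YWJj"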
if __name__ == "__main__":
    # name = '珠江频道,广东体育频道,广东公共频道,珠江频道海外版,广东新闻频道,嘉佳卡通频道,广东国际频道'.split(',')
    # url = 'GDTV-GDTV2,GDTV-GDTV3,GDTV-GDTV4,GDTV-GDTV5,GDTV-GDTV6,GDTV-GDTV7,GDTV-GDTV8'.split(',')
    name = '江苏综艺频道,江苏城市频道,江苏影视频道,江苏靓妆频道,江苏休闲体育频道,江苏优漫卡通频道,江苏公共·新闻,江苏教育电视台,江苏国际频道,江苏学习频道'.split(',')
    url = 'JSTV-JSTV2,JSTV-JSTV3,JSTV-JSTV4,JSTV-JSTV5,JSTV-JSTV6,JSTV-JSTV7,JSTV-JSTV8,JSTV-JSTV9,JSTV-JSTV10,JSTV-JETVZK'.split(',')
    os.makedirs('./epg/province', exist_ok=True)  # open(..., "a") fails if the directory is missing
    for j in range(len(name)):
        file_name = './epg/province/' + name[j] + '.txt'
        for i in range(1, 8):  # -w1 .. -w7 pages, one per weekday
            daynum = str(i)
            # crawl("https://www.tvmao.com/program/CCTV-CCTV1-w" + str(i) + ".html", file_name, daynum)
            crawl("https://www.tvmao.com/program/" + url[j] + '-w' + str(i) + ".html", file_name, daynum)
