# Kept only as a record of my own development work; this is not meant to be technically sophisticated code.
# -*- coding: utf-8 -*-
# @Time : 2019/7/23 上午7:36
# @Author : _Hebel
# @Site :
# @File : get_article_data.py
# @Software: PyCharm
import re
import requests
import js2py
def collect_news_data(url, timeout=10):
    """Fetch the page at ``url`` and return its body text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. It is
            handed to ``requests.get``; without a timeout a stalled
            connection would block the script indefinitely.

    Returns:
        The response text when the server answers with HTTP 200,
        otherwise ``False``.

    Raises:
        requests.exceptions.RequestException: Propagated from
            ``requests.get`` on connection failures or when the timeout
            expires.
    """
    headers = {
        'content-type': "text/html",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
        'cache-control': "max-age=0",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    if resp.status_code == 200:
        return resp.text
    return False
def dispose_jscode(html):
    """Assemble the complete JavaScript snippet from the page HTML.

    The first stretch of text that starts at ``var CryptoJS`` and runs up
    to the nearest ``</script>`` is handed to ``joint_compile`` and
    ``joint_encryption``.

    Args:
        html: Page source text.

    Returns:
        The encryption code followed by the word-producing code, or
        ``False`` when the script is not found or either helper returns
        a falsy result.
    """
    found = re.search('(var CryptoJS.*?)</script>', html, re.S)
    if found is None:
        return False

    script_body = found.group(1)
    for_code = joint_compile(script_body)
    encryption_code = joint_encryption(script_body)
    if not (for_code and encryption_code):
        return False
    return encryption_code + for_code
def joint_encryption(js_code):
    """Extract and clean the encryption portion of the page script.

    The wanted span starts at ``var CryptoJS`` and ends at the first ``;``
    that follows ``new Array(secWords``. When the part of that span from
    ``var iv=`` onwards mentions ``window``, every ``if...}`` fragment
    (shortest match, within one line) is removed from it and the result
    is substituted back in place of the ``var iv=...`` text.

    Args:
        js_code: JavaScript source text.

    Returns:
        The (possibly cleaned) encryption code, or ``False`` when the
        span is missing, is shorter than 15000 characters, has no
        ``var iv=`` part, or that part is shorter than 300 characters.
    """
    spans = re.findall(r"(var CryptoJS.+?new Array\(secWords.+?;)", js_code, re.S)
    # NOTE(review): the 15000 / 300 thresholds presumably reject truncated
    # or unexpected scripts -- confirm against real pages.
    if not spans or len(spans[0]) < 15000:
        return False
    encryption_code = spans[0]

    iv_parts = re.findall(r"(var iv=.+)", encryption_code, re.S)
    if not iv_parts or len(iv_parts[0]) < 300:
        return False
    iv_code = iv_parts[0]

    if "window" not in iv_code:
        return encryption_code

    cleaned = re.sub(r"(if.+?\})", '', iv_code)
    # Use a callable so `cleaned` is inserted verbatim. Passed as a plain
    # string, re.sub would treat it as a replacement template and choke on
    # (or rewrite) backslash sequences such as \x41 inside the JavaScript.
    return re.sub(r"(var iv=.+)", lambda _match: cleaned, encryption_code)
def joint_compile(js_code):
    """Rebuild the JavaScript that produces the hidden words.

    The text from ``for(var i=0x0;i<secWords`` up to (not including) the
    last following ``for(`` is taken, any ``typeof document`` check in it
    is neutralised, and the result is wrapped in a function assigned to
    ``encode_kw`` that returns ``words``.

    Args:
        js_code: JavaScript source text.

    Returns:
        The wrapped JavaScript source, or ``False`` when the loop is not
        found or a ``typeof document===`` check has an unrecognised form.
    """
    loop_match = re.search(r"(for\(var i=0x0;i<secWords.+)for\(", js_code, re.S)
    if loop_match is None:
        return False
    loop_src = loop_match.group(1)

    if "typeof document===" in loop_src:
        # Two known shapes of the comparison; the first that occurs is
        # replaced everywhere with the literal `false`.
        strict_checks = (
            r"(typeof document===\w+?\('\w+?'\))",
            r"(typeof document===\w+?\[\w+?\('\w+?'\)\])",
        )
        for check in strict_checks:
            patched, hits = re.subn(check, 'false', loop_src)
            if hits:
                break
        else:
            return False
    elif "typeof document" in loop_src:
        patched = loop_src.replace("typeof document,", "")
    else:
        patched = loop_src

    return "var encode_kw = function () {" + patched + ";return words};"
def group_kw(words):
    """Map each word to its placeholder key.

    Args:
        words: Indexable, sized collection of words.

    Returns:
        A dict whose keys are ``context_kw0``, ``context_kw1``, ... and
        whose values are the words at the matching positions.
    """
    # Positional access via len()/[] is kept on purpose: `words` may be a
    # wrapper object rather than a plain list -- TODO confirm with caller.
    return {
        "context_kw{num}".format(num=position): words[position]
        for position in range(len(words))
    }
def regroup_article(article_html, words, file_name):
    """Fill the placeholder spans with their words and save the article.

    Args:
        article_html: HTML fragment containing
            ``<span class="KEY"></span>`` placeholders.
        words: Mapping of placeholder class name to its replacement text.
        file_name: Base name of the output file; the text is written to
            ``./<file_name>.txt``.
    """
    for css_class, text in words.items():
        article_html = article_html.replace('<span class="' + css_class + '"></span>', text)
    article_content = article_html.replace("</p>", "").replace("<p>", "")
    # TODO: per the original author, the result cannot currently be shown
    # with print; open the saved file to view it.
    # Write as UTF-8 explicitly: the platform default encoding may be
    # unable to represent the article's characters.
    with open("./{file_name}.txt".format(file_name=file_name), "w", encoding="utf-8") as f:
        f.write(article_content)
def Run(url):
    """Download one chapter page, recover its hidden words and save the text.

    The article fragment and the inline script are pulled out of the page,
    the rebuilt script is evaluated with js2py to obtain the words, and
    the article with placeholders filled in is written to a ``.txt`` file
    named after the last path segment of ``url`` (without ``.html``).

    Args:
        url: Address of the chapter page.

    Returns:
        None. A message is printed and the function returns early when
        the download, the article extraction or the script processing
        fails.
    """
    html = collect_news_data(url)
    if html is False:
        # collect_news_data signals a non-200 response with False; passing
        # that on to re.findall would raise TypeError.
        print("请求页面失败")
        return
    article_html_list = re.findall(r'<div class="rdtext" fsize="16">(.+?)</div>', html, re.S)
    if not article_html_list:
        print("获取文章片段失败")
        return
    js_code = dispose_jscode(html)
    if not js_code:
        print("js处理失败")
        return
    # NOTE(review): this evaluates JavaScript taken from the downloaded
    # page. Only run it against sources you trust.
    words = js2py.eval_js(js_code)('encode_kw')
    words_dict = group_kw(words)
    file_name = url.replace(".html", "").split("/")[-1]
    regroup_article(article_html_list[0], words_dict, file_name)
if __name__ == '__main__':
    # Other sample chapter pages that can be substituted for the one below:
    #   https://g.hongshu.com/content/98722/15255548.html
    #   https://g.hongshu.com/content/93717/13904933.html
    #   https://g.hongshu.com/content/98117/15132711.html
    #   https://g.hongshu.com/content/91351/13430130.html
    #   https://g.hongshu.com/content/91307/13410516.html
    #   https://g.hongshu.com/content/95075/14182937.html
    Run("https://g.hongshu.com/content/93416/13877912.html")