文章文字加密案例（直接执行原始 JS，而非用 Python 重写 JS）

最新推荐文章于 2020-08-27 17:56:08 发布

_JackSparrow

最新推荐文章于 2020-08-27 17:56:08 发布

阅读量231

点赞数 2

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/qq_42709514/article/details/96984527

本文介绍了一种从红袖小说网站抓取特定小说章节内容的方法。通过发送HTTP请求并解析返回的数据，利用正则表达式和JavaScript代码处理来提取加密的文章内容。最终将解密后的文章内容保存为本地文件。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

本文仅用于记录自己的开发成果，代码本身并没有太高的技术含量。

# -*- coding: utf-8 -*-
# @Time : 2019/7/23 上午7:36
# @Author : _Hebel
# @Site : 
# @File : get_article_data.py
# @Software: PyCharm

import re
import requests
import js2py


def collect_news_data(url, timeout=10):
    """Fetch a page with browser-like headers and return its body text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded response body when the status code is 200, otherwise
        ``False``.

    Raises:
        requests.RequestException: Propagated unchanged on connection
            failures or when the timeout expires.
    """
    # NOTE(review): 'br' is advertised in accept-encoding; confirm the
    # environment can decode brotli responses if the server ever sends them.
    headers = {
        'content-type': "text/html",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7",
        'cache-control': "max-age=0",
        'upgrade-insecure-requests': "1",
        'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    if resp.status_code == 200:
        return resp.text
    return False


def dispose_jscode(html):
    """Build the complete JavaScript program to evaluate for a page.

    The first ``var CryptoJS ... </script>`` section of ``html`` is handed to
    ``joint_compile`` and ``joint_encryption``; their outputs are joined with
    the encryption part first.

    Args:
        html: Full page source.

    Returns:
        The combined JavaScript source, or ``False`` when the script section
        is missing or either helper returns a falsy value.
    """
    script_sections = re.findall('(var CryptoJS.*?)</script>', html, re.S)
    if len(script_sections) == 0:
        return False

    raw_script = script_sections[0]
    loop_part = joint_compile(raw_script)
    cipher_part = joint_encryption(raw_script)
    if not (loop_part and cipher_part):
        return False
    return cipher_part + loop_part


def joint_encryption(js_code):
    """Extract the CryptoJS section of the script and strip ``if`` blocks.

    The section runs from ``var CryptoJS`` to the first ``;`` that follows
    ``new Array(secWords``. When the part starting at ``var iv=`` mentions
    ``window``, every ``if...}`` span in that part is removed (presumably
    browser-environment checks that cannot run outside a page -- confirm
    against a live script).

    Args:
        js_code: JavaScript source containing the CryptoJS code.

    Returns:
        The (possibly cleaned) JavaScript text, or ``False`` when the
        section is not found, is shorter than 15000 characters, or its
        ``var iv=`` part is missing or shorter than 300 characters.
    """
    encode_code_list = re.findall(
        r"(var CryptoJS.+?new Array\(secWords.+?;)", js_code, re.S)
    # Length thresholds reject matches that are too short; presumably a
    # sanity check that the whole library was captured.
    if not encode_code_list or len(encode_code_list[0]) < 15000:
        return False
    encode_code = encode_code_list[0]

    iv_code_list = re.findall(r"(var iv=.+)", encode_code, re.S)
    if not iv_code_list or len(iv_code_list[0]) < 300:
        return False
    iv_code = iv_code_list[0]

    if "window" not in iv_code:
        return encode_code

    cleaned_iv_code = re.sub(r"(if.+?\})", '', iv_code)
    # Use a function replacement: a plain string would be parsed as a
    # replacement template, so backslashes in the JavaScript (e.g. "\x41")
    # would raise re.error or be silently rewritten.
    return re.sub(r"(var iv=.+)", lambda _match: cleaned_iv_code, encode_code)



def joint_compile(js_code):
    """Wrap the word-generating ``for`` loop in a callable JS function.

    The loop text starts at ``for(var i=0x0;i<secWords`` and ends just before
    the last following ``for(``. ``typeof document`` checks inside it are
    neutralised so the code does not depend on ``document``.

    Args:
        js_code: JavaScript source containing the loop.

    Returns:
        JavaScript source defining ``encode_kw`` as a function that runs the
        loop and returns ``words``; ``False`` when the loop is not found or a
        ``typeof document===`` comparison has an unrecognised right-hand side.
    """
    loops = re.findall(r"(for\(var i=0x0;i<secWords.+)for\(", js_code, re.S)
    if not loops:
        return False

    body = loops[0]
    if "typeof document===" in body:
        # Two known shapes of the comparison; the first one present wins.
        known_forms = (
            re.compile(r"(typeof document===\w+?\('\w+?'\))"),
            re.compile(r"(typeof document===\w+?\[\w+?\('\w+?'\)\])"),
        )
        for form in known_forms:
            if form.search(body) is not None:
                body = form.sub('false', body)
                break
        else:
            return False
    elif "typeof document" in body:
        body = body.replace("typeof document,", "")

    return "var encode_kw = function () {" + body + ";return words};"


def group_kw(words):
    """Map placeholder class names to their words.

    Args:
        words: Indexable sequence; element ``i`` is stored under the key
            ``context_kw<i>``.

    Returns:
        A dict from ``context_kw0``, ``context_kw1``, ... to the elements of
        ``words`` in order.
    """
    key_template = "context_kw{num}"
    return {
        key_template.format(num=position): words[position]
        for position in range(len(words))
    }


def regroup_article(article_html, words, file_name):
    """Fill in the placeholder spans and save the article as a text file.

    Args:
        article_html: Article HTML fragment containing
            ``<span class="KEY"></span>`` placeholders.
        words: Mapping from placeholder class name to its replacement text.
        file_name: Base name of the output file; the text is written to
            ``./<file_name>.txt`` in the current working directory.
    """
    for k, v in words.items():
        article_html = article_html.replace('<span class="' + k + '"></span>', v)
    article_content = article_html.replace("</p>", "").replace("<p>", "")
    # TODO: the result currently cannot be shown with print; open the saved
    # file to view it.
    # Write UTF-8 explicitly: the locale default encoding may be unable to
    # represent the Chinese text and would raise UnicodeEncodeError.
    with open("./{file_name}.txt".format(file_name=file_name), "w", encoding="utf-8") as f:
        f.write(article_content)


def Run(url):
    """Download one chapter page, decode its hidden words and save the text.

    Each failure stage prints a message and returns early. On success the
    article is written by ``regroup_article`` to a file named after the last
    path segment of ``url`` with ``.html`` removed.

    Args:
        url: Address of the chapter page.
    """
    html = collect_news_data(url)
    # collect_news_data returns False on a non-200 response; passing that to
    # re.findall would raise TypeError, so stop here instead.
    if not html:
        print("请求页面失败")
        return
    article_html_list = re.findall(r'<div class="rdtext" fsize="16">(.+?)</div>', html, re.S)
    if not article_html_list:
        print("获取文章片段失败")
        return
    js_code = dispose_jscode(html)
    if not js_code:
        print("js处理失败")
        return
    # NOTE(review): this evaluates JavaScript taken from the downloaded page;
    # only run it against sources you trust.
    words = js2py.eval_js(js_code)('encode_kw')
    words_dict = group_kw(words)
    file_name = url.replace(".html", "").split("/")[-1]
    regroup_article(article_html_list[0], words_dict, file_name)

if __name__ == '__main__':
    # Script entry point. The lines below are sample chapter URLs; exactly one
    # assignment is left uncommented to choose the chapter passed to Run().
    # url = "https://g.hongshu.com/content/98722/15255548.html"
    # url = "https://g.hongshu.com/content/93717/13904933.html"
    url = "https://g.hongshu.com/content/93416/13877912.html"
    # url = "https://g.hongshu.com/content/98117/15132711.html"
    # url = "https://g.hongshu.com/content/91351/13430130.html"
    # url = "https://g.hongshu.com/content/91307/13410516.html"
    # url = "https://g.hongshu.com/content/95075/14182937.html"

    Run(url)