# Python爬取ZTBU官网公告获取及推送_如何爬取官网公告-CSDN博客

import mysql.connector  # 导入库                        
import re
import urllib.request 
from lxml import etree 
import urllib3 
# coding:utf-8 
import requests 
import json 
import time  # 定时 
from datetime import datetime, timedelta  # 定时 

# Silence urllib3's warnings; the HTTP calls in this file pass verify=False.
urllib3.disable_warnings()

# 数据库
# Keyword arguments handed to mysql.connector.connect(**config).
config = dict(
    host='',  # original note: defaults to 127.0.0.1
    user='',  # user name
    password='',  # password
    port=3306,  # port, 3306 by default
    database='ztbu_notice',  # database name
    charset='utf8mb4',  # character encoding
)


# cnn = mysql.connector.connect(**config)  # 建立MySQL连接 
# cursor = cnn.cursor()  # 获得游标


# 获取列表数据
def acquire_link_data():
    """Check every tracked notice page and push newly published notices.

    Reads all rows of ``link_data``, scrapes the newest notice of each row's
    page with ``acquire_link_webdata`` and compares its headline with the one
    stored in the row.  When they differ, the stored record is refreshed via
    ``update_link_data`` and the notice is pushed via
    ``acquire_notice_list_user_send``; otherwise a "no update" line is
    printed.

    Column positions used (``select *`` order): 1 = link code, 2 = name shown
    in the "no update" message, 4 = list-page URL, 5 = stored headline.
    NOTE(review): these meanings are inferred from how each value is used
    here; confirm against the ``link_data`` schema.
    """
    cnn = mysql.connector.connect(**config)
    try:
        cursor = cnn.cursor()
        try:
            cursor.execute("select * from link_data")
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # All rows are already in memory, so release the connection before
        # the slow scraping/pushing below.  Previously the connection was
        # never closed, leaking one connection per scheduled run.
        cnn.close()

    for row in rows:
        link_code, site_name, page_url, stored_headline = row[1], row[2], row[4], row[5]
        link, headline, date, text = acquire_link_webdata(str(page_url))
        if stored_headline != headline:
            # Persist the new headline first, then notify subscribed users.
            update_link_data(link_code, headline, date)
            acquire_notice_list_user_send(link_code, headline, link, date, text)
        else:
            print(f"{site_name}，无更新！")


# 获取用户数据
def acquire_notice_list_user_send(linkcode, headline, link, date, text):
    """Push one notice to every user subscribed to ``linkcode``.

    Selects the rows of ``notice_list`` whose column named by ``linkcode``
    equals 1 and calls ``massage_send`` for each of them, pausing one second
    between users.

    Args:
        linkcode: Name of the ``notice_list`` column used as the filter.
        headline: Notice headline.
        link: URL of the notice.
        date: Notice date as scraped from the page.
        text: Notice body to push.
    """
    # NOTE(review): ``linkcode`` is spliced in as a column name. Identifiers
    # cannot be bound as query parameters, so this value must only ever come
    # from a trusted source.
    sql = f"select * from notice_list where {linkcode} = 1"

    cnn = mysql.connector.connect(**config)
    try:
        cursor = cnn.cursor()
        try:
            cursor.execute(sql)
            users = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Close before pushing: the original never closed the connection and
        # would otherwise keep it open across all the per-user sleeps.
        cnn.close()

    for user in users:
        # Columns 1..3 are passed as massage_send's user_id, push_select and
        # push_token respectively.
        massage_send(user[1], user[2], user[3], headline, link, date, text)
        time.sleep(1)


# 更新数据库
def update_link_data(linkcode, Title, Time):
    """Store the latest headline and date for a tracked page in ``link_data``.

    Args:
        linkcode: Code of the tracked page; rows whose ``code`` column
            contains this value (``LIKE '%linkcode%'``) are updated.
        Title: New headline to store.
        Time: New date string to store.

    Returns:
        True once the update has been committed.
    """
    # Bind the values as parameters instead of formatting them into the SQL
    # text: the headline is scraped from a web page, and a quote character in
    # it would break (or inject into) a string-built statement.
    sql = "update link_data set Title=%s, Time=%s where code like %s"
    params = (Title, Time, f"%{linkcode}%")

    cnn = mysql.connector.connect(**config)
    try:
        cursor = cnn.cursor()
        try:
            cursor.execute(sql, params)
            cnn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection; it used to be left open.
        cnn.close()

    print(f"高校：{linkcode}保留数据更新成功！")
    return True


# 获取网页数据
def acquire_link_webdata(url):
    """Scrape the newest notice of a list page together with its body text.

    Args:
        url: URL of the notice list page.

    Returns:
        A tuple ``(link, headline, date, text)`` where ``link`` is the href
        of the first list entry, ``headline`` its ``title`` attribute,
        ``date`` the content of its ``datatime`` span and ``text`` the body
        text of the linked page followed by its attachment links.

    Raises:
        IndexError: If the list page has no entry matching the expected
            anchor markup.
    """
    header = {
        'Connection': 'close',
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36"}

    # Fetch the list page and pick its first entry.
    response = requests.get(url=url, headers=header, timeout=10, verify=False)
    response.encoding = 'utf-8'
    notice_data = re.findall(
        '<a href="(.*?)" target="_blank" title="(.*?)">(.*?)</a><span class="datatime">(.*?)</span>', response.text)
    if not notice_data:
        # Same exception type the bare ``notice_data[0]`` produced, but with
        # a message that says which page failed.
        raise IndexError(f"no notice entry found on list page: {url}")
    link, headline, _anchor_text, date = notice_data[0]

    # Fetch the notice page itself.  The timeout keeps one stalled server
    # from blocking the caller forever, and ``with`` closes the response.
    request = urllib.request.Request(url=link, headers=header)
    with urllib.request.urlopen(request, timeout=10) as detail_response:
        content = detail_response.read().decode('utf-8')

    # All text nodes inside the content container form the body.
    parse_html = etree.HTML(content)
    parts = list(parse_html.xpath('//div[@class="layout_txtcontent_content"]//text()'))

    # Attachment links: each one is appended as an HTML anchor and again as a
    # plain-text line.
    link_data_fujian = re.findall(
        '<a href="(.*?)" target="_blank" style="font-family: 微软雅黑; font-size: 14pt;">(.*?)</a>', content)
    for href, label in link_data_fujian:
        parts.append(f'<a href="{href}">{label}</a><br>')
        parts.append(label + "\n")

    text = "".join(parts)
    return link, headline, date, text


# 推送
def massage_send(user_id, push_select, push_token, web_head, web_link, web_time, web_text):
    """Push one notice to a user through the channel chosen by ``push_select``.

    Selectors: "1" pushplus, "2" WxPusher, "3" kutui push, "4" hi_push,
    "5" the ftqq.com "sever" channel.  Any other value only prints a failure
    line.  The function always sleeps two seconds before returning.

    Args:
        user_id: Identifier shown in the printed status line.
        push_select: Channel selector, compared against the strings "1".."5".
        push_token: Channel credential.  For WxPusher the expected format is
            ``appToken#uid``.
        web_head: Notice headline, used as title/summary.
        web_link: URL of the notice; only the WxPusher branch sends it.
        web_time: Notice date; accepted but not used by any channel.
        web_text: Notice body; sent as HTML to pushplus and WxPusher.
    """
    name = None  # stays None when the selector matches no channel
    res = None

    if push_select == "1":
        # pushplus
        name = "pushplus"
        url = "http://www.pushplus.plus/send/"
        data = {
            "token": push_token,
            "title": web_head,
            "content": web_text,
            "template": "html"
        }
        # A timeout, as on every other channel, so a stalled server cannot
        # hang the whole push loop.
        res = requests.post(url, data=data, timeout=10)
    elif push_select == "2":
        # WxPusher
        name = "Wxpusher"
        p = push_token.split("#")  # format: token#uid
        # NOTE(review): these headers look copied from a Postman capture
        # (fixed Content-Length, Postman-Token); confirm they are needed.
        header = {
            "Content-Type": "application/json",
            "User-Agent": "PostmanRuntime/7.26.5",
            "Accept": "*/*",
            "Postman-Token": "8b253f0e-292c-4b0d-8b37-86cc344bb199",
            "Host": "wxpusher.zjiecode.com",
            "Accept-Encoding": "gzip, deflate, br",
            "Content-Length": "221",
            "Connection": "keep-alive"
        }
        data = {
            "appToken": p[0],
            "content": web_text,
            "summary": web_head,
            # Content type: 1 = plain text, 2 = HTML (body contents only,
            # without the body tag), 3 = markdown.
            "contentType": 2,
            "uids": [p[1]],
            "url": web_link,  # link to the original notice, optional
            # Whether to check the subscription period: True pushes only to
            # paying subscribers, False pushes without checking payment or
            # subscription expiry.
            "verifyPay": False,
        }
        res = requests.post("https://wxpusher.zjiecode.com/api/send/message", data=json.dumps(data), headers=header,
                            timeout=10, verify=False).json()["data"][0]["status"]
    elif push_select == "3":
        # kutui_push
        name = "酷推"
        data = web_head + "\n" + web_text
        res = requests.post(url="https://push.xuthus.cc/send/" + push_token, data=data.encode('utf-8'), timeout=10,
                            verify=False).text
    elif push_select == "4":
        # hi_push
        name = "Hi提醒"
        # Let requests URL-encode the query values: the headline and HTML
        # body contain characters such as '&', '#' and '=' that corrupt a
        # hand-formatted query string.
        res = requests.post(
            "https://api.caicenter.com/send.php",
            params={"appkey": push_token, "title": web_head, "describes": web_text, "channel": 1},
            timeout=10, verify=False)
    elif push_select == "5":
        # sever
        name = "方糖"
        # Same reason as above: encode title/body instead of splicing them
        # into the URL.
        res = requests.post(
            f"https://sctapi.ftqq.com/{push_token}.send",
            params={"title": web_head, "desp": web_text},
            timeout=10, verify=False)

    if name is None:
        # Unknown selector.
        print(f"用户：{user_id} 推送失败！\n")
    else:
        # NOTE(review): the line reports success whatever ``res`` contains.
        print(f"用户：{user_id}\n状态：{res}推送成功！\n推送方式：{name}\n")
    time.sleep(2)


# 主程序
def main():
    """Run one polling pass: print the start banner, then check all pages."""
    print("开始启动！")
    acquire_link_data()


# 定时
def timing(times):
    """Run ``main()`` repeatedly, ``times`` seconds apart, forever.

    The first deadline is a fixed date in 2022 plus ``times`` seconds, so for
    any ordinary interval it is already in the past and the first run starts
    immediately.  Each following deadline is ``times`` seconds after the
    moment the previous run *started*; a run that takes longer than the
    interval is therefore followed by the next one without delay.

    A run that raises is reported and does not stop the loop, so a single
    network error or unparsable page cannot end the polling for good.
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate.

    Args:
        times: Interval between runs, in seconds.
    """
    print(f"定时任务已启动：{times}秒")
    # Fixed reference start time.
    start_time = datetime.strptime('2022-12-27 15:00:00', '%Y-%m-%d %H:%M:%S')
    next_run_time = start_time + timedelta(seconds=times)
    i = 1
    while True:
        current_time = datetime.now()
        if current_time < next_run_time:
            # Not due yet: poll the clock once per second.
            time.sleep(1)
            continue

        today = time.localtime()
        nowtime = "{}年{}月{}日".format(today[0], today[1], today[2])
        try:
            main()
        except Exception as exc:
            # Scheduler boundary: report the failure and keep polling.
            print(f"{nowtime} 第{i}次执行失败：{exc!r}")
        else:
            print(f"{nowtime} 第{i}次执行")
        i += 1
        # Schedule relative to when this run started.
        next_run_time = current_time + timedelta(seconds=times)


if __name__ == "__main__":
    # Script entry point: start the polling loop with a 30-second interval.
    timing(30)