Getting this page decoded properly ate a whole afternoon. The page encoding kept switching between UTF-8, ISO-8859-1 and GBK, the advice online contradicted itself and none of it worked. What finally did the trick was something I pieced together myself: decode the whole page as GBK and ignore whatever doesn't fit, i.e. decode('gbk', 'ignore').
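A minimal sketch of that trick (the URL here is just a placeholder; errors='ignore' simply drops any byte sequence that is not valid GBK instead of raising UnicodeDecodeError):

import requests

resp = requests.get("http://www.xinnong.net")  # placeholder URL, substitute the real page
# decode the raw bytes as GBK and silently skip anything that is not valid GBK
html = resp.content.decode('gbk', 'ignore')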
Enough talk, here's the code.
The site taking the hit today is 新农网 (xinnong.net).
# -*- coding: utf-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup  # used to parse the HTML
import uuid
import time

url = "http://www.xinnong.net"
# category codes to crawl; note the p_type mapping further down checks for names
# like 'shucai' / 'xuqin', so with these codes every record falls into the else branch
farmType = ["t20354", "t20349", "t20346"]
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 '
                  'Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
cur = conn.cursor()
print("Database connected")
for tt in farmType:
    for i in range(1, 2):  # range(1, 2) only yields 1, so only the first listing page is crawled
        resp = requests.get(f"https://www.nongjx.com/subweb/{tt}/tech.html", headers=headers)
        # decode the whole listing page as GBK, ignoring bytes that are not valid GBK
        page_one = BeautifulSoup(resp.content.decode('gbk', 'ignore'), "html.parser")
        dd = page_one.find('div', class_='newslist').find_all('li')
        for ss in dd:
            sUrl = url + ss.find('a')['href']
            # open the article (second-level) page
            rp = requests.get(sUrl, headers=headers)
            page_two = BeautifulSoup(rp.content.decode('gbk', 'ignore'), "html.parser")
            paper_id = str(uuid.uuid1())
            # title
            title = page_two.find('div', class_='arctit').find('h1').text
            # source: the text following "来源" in the article info line
            source = page_two.find('div', class_='arctit').find('div', class_='arcinfo').text.split('来源')[1][1:]
            # publication date: characters 5-14 of the article info line
            timet = page_two.find('div', class_='arctit').find('div', class_='arcinfo').text[5:15]
            # abstract; strip() with a string argument removes characters, not a prefix,
            # so cut off the leading "摘要:" label explicitly
            desc = page_two.find('div', class_='arcdes').text.strip()
            if desc.startswith('摘要:'):
                desc = desc[len('摘要:'):].strip()
            # body text
            content = page_two.find('div', id='article').text.strip()
            if tt == 'shucai' or tt == 'shiyongjun':
                p_type = "温室大棚"
            elif tt == 'xuqin' or tt == 'shuichan':
                p_type = "畜牧水产"
            else:
                p_type = "大田林业"
            sql = "insert into knowledge(id,title,source,timet,descrip,content,p_type,url) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (paper_id, title, source, timet, desc, content, p_type, sUrl))
        print("Page {} of category {} done".format(i, tt))
        conn.commit()
        time.sleep(1)  # wait a second between pages so the server does not get hammered
cur.close()
conn.close()
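For completeness, here is a one-off sketch for creating the knowledge table the INSERT above writes into. The column names come straight from the INSERT statement; the column types and lengths are only my assumption, so adjust them to whatever your real schema needs.

import pymysql

# one-off helper: create the table the crawler inserts into;
# column names match the INSERT statement above, the types are assumptions
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS knowledge (
            id      VARCHAR(64) PRIMARY KEY,
            title   VARCHAR(255),
            source  VARCHAR(255),
            timet   VARCHAR(32),
            descrip TEXT,
            content MEDIUMTEXT,
            p_type  VARCHAR(32),
            url     VARCHAR(255)
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()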