The idea is simple: first crawl the category data, then use each category to crawl its list pages, and finally crawl the detail page for every entry in those lists.
Enough talk; straight to the code.
Today's unlucky target is the agriculture atlas site, tupu.zgny.com.cn.
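One thing the script takes for granted is that a MySQL table named knowledge already exists in the zhang database. The post does not show its schema, so the snippet below is only a sketch with assumed column types and lengths; adjust them to the real data before use.

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS knowledge (
            id      VARCHAR(64)  PRIMARY KEY,  -- UUID generated per article
            title   VARCHAR(255),              -- article title
            source  VARCHAR(255),              -- article source
            timet   VARCHAR(32),               -- publish date as text
            content TEXT,                      -- article body
            p_type  VARCHAR(64),               -- category label
            url     VARCHAR(512)               -- detail page URL
        ) DEFAULT CHARSET=utf8
    """)
conn.close()

With that table in place, the full script is below.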
# -*- coding: utf-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup  # used to parse the HTML
import uuid
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 '
                  'Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
cur = conn.cursor()
print("Connected to MySQL")
a = []
# Grab every category code for the pest & disease diagnosis channel and keep it in a list
a_resp = requests.get("http://tupu.zgny.com.cn/list_nzwbch.aspx", headers=headers)
a_page_one = BeautifulSoup(a_resp.content, "html.parser")
a_dd = a_page_one.find('div', class_='bigzhans_list_right_txt').find_all('a')
for ss in a_dd:
    # the category code is the third underscore-separated segment of the link
    a.append(ss['href'].split('/')[1].split('_')[2])
b = []
# Same again for the second channel: grab every category code and keep it in a list
b_resp = requests.get("http://tupu.zgny.com.cn/list_jibingfz.aspx", headers=headers)
b_page_one = BeautifulSoup(b_resp.content, "html.parser")
b_dd = b_page_one.find('div', class_='bigzhans_list_right_txt').find_all('a')
for ss in b_dd:
    b.append(ss['href'].split('/')[1].split('_')[2])
print(a)
print(b)
for zz in a:
    for i in range(1, 15):
        resp1 = requests.get(f"http://tupu.zgny.com.cn/Page_{i}_NodeId_nzwbch_js_{zz}.shtml", headers=headers)
        page_two = BeautifulSoup(resp1.content, "html.parser")
        dd = page_two.find('ul', class_='home-list3ccc').find_all('li')
        # if this page has no entries, there is nothing further to crawl for this category
        if not dd:
            break
        for ss in dd:
            paper_id = str(uuid.uuid1())
            sUrl = ss.find('a')['href']
            resp2 = requests.get(sUrl, headers=headers)
            page_three = BeautifulSoup(resp2.content, "html.parser")
            # title
            title = page_three.find('div', class_='conLeft').find('h1').text
            # source (first space-separated token of the info line)
            source = page_three.find('div', class_='conLeft').find('p').text.split(' ')[0]
            # publish date (taken from a fixed position in the info line)
            timet = page_three.find('div', class_='conLeft').find('p').text[12:22]
            # body text
            content = page_three.find('div', class_='wenZi_02').text.strip()
            # category label: 病虫害诊断 (pest & disease diagnosis)
            p_type = "病虫害诊断"
            sql = "insert into knowledge(id,title,source,timet,content,p_type,url) VALUES (%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (paper_id, title, source, timet, content, p_type, sUrl))
        print("Pest & disease diagnosis: page {} inserted".format(i))
        conn.commit()
        time.sleep(1)  # sleep one second so the server doesn't fall over
    print("Pest & disease diagnosis: category {} finished".format(zz))
for zz in b:
    for i in range(1, 15):
        resp1 = requests.get(f"http://tupu.zgny.com.cn/Page_{i}_NodeId_jibingfztp_js_{zz}/List.shtml", headers=headers)
        page_two = BeautifulSoup(resp1.content, "html.parser")
        dd = page_two.find('ul', class_='home-list3ccc').find_all('li')
        # same early exit: stop paging this category once a page comes back empty
        if not dd:
            break
        for ss in dd:
            paper_id = str(uuid.uuid1())
            sUrl = ss.find('a')['href']
            resp2 = requests.get(sUrl, headers=headers)
            page_three = BeautifulSoup(resp2.content, "html.parser")
            # title
            title = page_three.find('div', class_='conLeft').find('h1').text
            # source
            source = page_three.find('div', class_='conLeft').find('p').text.split(' ')[0]
            # publish date
            timet = page_three.find('div', class_='conLeft').find('p').text[12:22]
            # body text
            content = page_three.find('div', class_='wenZi_02').text.strip()
            # category label: 检疫防治 (quarantine & control)
            p_type = "检疫防治"
            sql = "insert into knowledge(id,title,source,timet,content,p_type,url) VALUES (%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (paper_id, title, source, timet, content, p_type, sUrl))
        print("Quarantine & control: page {} inserted".format(i))
        conn.commit()
        time.sleep(1)  # sleep one second so the server doesn't fall over
    print("Quarantine & control: category {} finished".format(zz))
cur.close()
conn.close()
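Two weak points worth calling out. The script assumes the href values on the list pages are absolute URLs, and a single failed or malformed detail page will abort the whole run. The helper below is a hypothetical sketch (fetch_detail is my name, not part of the original script) of how both could be guarded against; it could also replace the duplicated detail-fetching code in the two loops.

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE = "http://tupu.zgny.com.cn/"

def fetch_detail(href, headers):
    # Normalize a possibly-relative href against the site root and swallow
    # per-page errors so one bad detail page does not stop the crawl.
    url = urljoin(BASE, href)
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return url, BeautifulSoup(resp.content, "html.parser")
    except requests.RequestException as exc:
        print("skipping {}: {}".format(url, exc))
        return url, None

Each loop body would then check for a None result and continue instead of crashing.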