The idea is simple: first crawl the category data, then use each category to crawl its list pages, and finally crawl the detail page for every entry in those lists.
Enough talk; straight to the code.
Today's unlucky target is the agriculture atlas site, tupu.zgny.com.cn.
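One thing the script takes for granted is that a MySQL table named knowledge already exists in the zhang database. The post does not show its schema, so the snippet below is only a sketch with assumed column types and lengths; adjust them to the real data before use.

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
with conn.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS knowledge (
            id      VARCHAR(64)  PRIMARY KEY,  -- UUID generated per article
            title   VARCHAR(255),              -- article title
            source  VARCHAR(255),              -- article source
            timet   VARCHAR(32),               -- publish date as text
            content TEXT,                      -- article body
            p_type  VARCHAR(64),               -- category label
            url     VARCHAR(512)               -- detail page URL
        ) DEFAULT CHARSET=utf8
    """)
conn.close()

With that table in place, the full script is below.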
# -*- coding: utf-8 -*-
import requests
import pymysql
from bs4 import BeautifulSoup  # used to parse the HTML
import uuid
import time

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 '
                  'Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
cur = conn.cursor()
print("Connected to MySQL")
a = []
# Grab every category code for the pest & disease diagnosis channel and keep it in a list
a_resp = requests.get("http://tupu.zgny.com.cn/list_nzwbch.aspx", headers=headers)
a_page_one = BeautifulSoup(a_resp.content, "html.parser")
a_dd = a_page_one.find('div', class_='bigzhans_list_right_txt').find_all('a')
for ss in a_dd:
    # the category code is the third underscore-separated segment of the link
    a.append(ss['href'].split('/')[1].split('_')[2])
b = []
# Same again for the second channel: grab every category code and keep it in a list
b_resp = requests.get("http://tupu.zgny.com.cn/list_jibingfz.aspx", headers=headers)
b_page_one = BeautifulSoup(b_resp.content, "html.parser")
b_dd = b_page_one.find('div', class_='bigzhans_list_right_txt').find_all('a')
for ss in b_dd:
    b.append(ss['href'].split('/')[1].split('_')[2])
print(a)
print(b)
for zz in a:
    for i in range(1, 15):
        resp1 = requests.get(f"http://tupu.zgny.com.cn/Page_{i}_NodeId_nzwbch_js_{zz}.shtml", headers=headers)
        page_two = BeautifulSoup(resp1.content, "html.parser")
        dd = page_two.find('ul', class_='home-list3ccc').find_all('li')
        # if this page has no entries, there is nothing further to crawl for this category
        if not dd:
            break
        for ss in dd:
            paper_id = str(uuid.uuid1())
            sUrl = ss.find('a')['href']
            resp2 = requests.get(sUrl, headers=headers)
            page_three = BeautifulSoup(resp2.content, "html.parser")
            # title
            title = page_three.find('div', class_='conLeft').find('h1').text
            # source (first space-separated token of the info line)
            source = page_three.find('div', class_='conLeft').find('p').text.split(' ')[0]
            # publish date (taken from a fixed position in the info line)
            timet = page_three.find('div', class_='conLeft').find('p').text[12:22]
            # body text
            content = page_three.find('div', class_='wenZi_02').text.strip()
            # category label: 病虫害诊断 (pest & disease diagnosis)
            p_type = "病虫害诊断"
            sql = "insert into knowledge(id,title,source,timet,content,p_type,url) VALUES (%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (paper_id, title, source, timet, content, p_type, sUrl))
        print("Pest & disease diagnosis: page {} inserted".format(i))
        conn.commit()
        time.sleep(1)  # sleep one second so the server doesn't fall over
    print("Pest & disease diagnosis: category {} finished".format(zz))
for zz in b:
    for i in range(1, 15):
        resp1 = requests.get(f"http://tupu.zgny.com.cn/Page_{i}_NodeId_jibingfztp_js_{zz}/List.shtml", headers=headers)
        page_two = BeautifulSoup(resp1.content, "html.parser")
        dd = page_two.find('ul', class_='home-list3ccc').find_all('li')
        # same early exit: stop paging this category once a page comes back empty
        if not dd:
            break
        for ss in dd:
            paper_id = str(uuid.uuid1())
            sUrl = ss.find('a')['href']
            resp2 = requests.get(sUrl, headers=headers)
            page_three = BeautifulSoup(resp2.content, "html.parser")
            # title
            title = page_three.find('div', class_='conLeft').find('h1').text
            # source
            source = page_three.find('div', class_='conLeft').find('p').text.split(' ')[0]
            # publish date
            timet = page_three.find('div', class_='conLeft').find('p').text[12:22]
            # body text
            content = page_three.find('div', class_='wenZi_02').text.strip()
            # category label: 检疫防治 (quarantine & control)
            p_type = "检疫防治"
            sql = "insert into knowledge(id,title,source,timet,content,p_type,url) VALUES (%s,%s,%s,%s,%s,%s,%s)"
            cur.execute(sql, (paper_id, title, source, timet, content, p_type, sUrl))
        print("Quarantine & control: page {} inserted".format(i))
        conn.commit()
        time.sleep(1)  # sleep one second so the server doesn't fall over
    print("Quarantine & control: category {} finished".format(zz))
cur.close()
conn.close()
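Two weak points worth calling out. The script assumes the href values on the list pages are absolute URLs, and a single failed or malformed detail page will abort the whole run. The helper below is a hypothetical sketch (fetch_detail is my name, not part of the original script) of how both could be guarded against; it could also replace the duplicated detail-fetching code in the two loops.

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE = "http://tupu.zgny.com.cn/"

def fetch_detail(href, headers):
    # Normalize a possibly-relative href against the site root and swallow
    # per-page errors so one bad detail page does not stop the crawl.
    url = urljoin(BASE, href)
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        resp.raise_for_status()
        return url, BeautifulSoup(resp.content, "html.parser")
    except requests.RequestException as exc:
        print("skipping {}: {}".format(url, exc))
        return url, None

Each loop body would then check for a None result and continue instead of crashing.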