建议使用单进程爬虫:爬取研招网信息时,花费时间多的是数据处理(数据量大),而不是爬取数据本身。如果使用多进程,那么不同进程必须将数据放置在不同的文件中,之后将数据再汇总也要花费大量的时间。经过实际测试,多进程需要5~7天完成,单进程需要1~2天完成。
单进程见
https://blog.youkuaiyun.com/qq_40142391/article/details/118638120?spm=1001.2014.3001.5501
代码可以直接运行,需要一个包含学科代码的txt文件,该文件的内容在网页https://yz.chsi.com.cn/zsml/pages/getZy.jsp
上,可以直接把它保存为一个txt文件(命名为subject.txt),放在项目文件夹内即可
程序会在E:/yanzhao目录下为每个省份创建一个xlsx文件存储运行结果,并最终汇总为normalization.xlsx,项目运行后得到的是研招网上所有的数据信息。
其结果示例如下
# coding:utf-8
import ast
import multiprocessing
import os
import time

import openpyxl
import requests
from bs4 import BeautifulSoup
# 采用多进程爬取数据处理数据,利用callback函数将得到的数据存储
# 系统的限制,对excel文件的修改,不能由多个进程同步修改,而且利用callback回调函数的执行也是线性的,并不能实现多进程存储(实际发现存储需要占用大量的时间资源)
# 因此,考虑一个城市存储在一个excel文件中
# 但是,一次测试跑了三天只是将所有数据分别放在一个excel文件中,并且可能会有内存中的数据丢失,因此考虑每存一点数据就存到excel中
# --- Module-level configuration ----------------------------------------------
# subject.txt holds a Python-literal list of discipline dicts such as
# [{"mc": "资产评估", "dm": "0256"}, ...], copied from
# https://yz.chsi.com.cn/zsml/pages/getZy.jsp
with open("subject.txt", "r", encoding="utf-8") as f:
    content = f.read()
# ast.literal_eval parses the literal safely; the original eval() would have
# executed arbitrary code if subject.txt were tampered with.
subject = ast.literal_eval(content)  # list of discipline (subject) dicts

workfolderurl = 'E:/yanzhao'      # folder that receives the per-province xlsx files
noteurl = "E:/yanzhao/note.txt"   # progress-log file (lives inside workfolderurl)
NETWORK_STATUS = True             # flipped to False by getpost() after a request timeout
# Province name ("mc") / code ("dm") pairs accepted by the chsi.com.cn query API.
province = [{"mc": "北京市", "dm": "11"}, {"mc": "天津市", "dm": "12"}, {"mc": "河北省", "dm": "13"}, {"mc": "山西省", "dm": "14"}, {"mc":"内蒙古自治区","dm":"15"},{"mc":"辽宁省","dm":"21"},{"mc":"吉林省","dm":"22"},{"mc":"黑龙江省","dm":"23"},{"mc":"上海市","dm":"31"},{"mc":"江苏省","dm":"32"},{"mc":"浙江省","dm":"33"},{"mc":"安徽省","dm":"34"},{"mc":"福建省","dm":"35"},{"mc":"江西省","dm":"36"},{"mc":"山东省","dm":"37"},{"mc":"河南省","dm":"41"},{"mc":"湖北省","dm":"42"},{"mc":"湖南省","dm":"43"},{"mc":"广东省","dm":"44"},{"mc":"广西壮族自治区","dm":"45"},{"mc":"海南省","dm":"46"},{"mc":"重庆市","dm":"50"},{"mc":"四川省","dm":"51"},{"mc":"贵州省","dm":"52"},{"mc":"云南省","dm":"53"},{"mc":"西藏自治区","dm":"54"},{"mc":"陕西省","dm":"61"},{"mc":"甘肃省","dm":"62"},{"mc":"青海省","dm":"63"},{"mc":"宁夏回族自治区","dm":"64"},{"mc":"新疆维吾尔自治区","dm":"65"}]
def getschool(ssdm,yjxkdm):
    """Return the names of schools offering discipline *yjxkdm* in province *ssdm*.

    ssdm   -- province code, e.g. "11" for Beijing
    yjxkdm -- first-level discipline code, e.g. "0812"
    """
    def _names(soup):
        # Each result row carries the school name inside a <td><a>; the first
        # 7 characters are the school-code prefix, which is stripped off.
        found = []
        for cell in soup.find_all("td"):
            link = cell.find('a')
            if link:
                found.append(link.get_text()[7:])
        return found

    # Page determined by province code + first-level discipline code.
    soup = getpost(ssdm, '', yjxkdm, '')
    listschool = _names(soup)  # page 1
    # The pager renders page links as <a href="#">; more than one such link
    # means extra pages exist, and the second-to-last link holds the last
    # page number.
    pagers = soup.find_all(href='#')
    if len(pagers) > 1:
        pagenum = int(pagers[-2].get_text())
        for page in range(2, pagenum + 1):
            listschool.extend(_names(getpost(ssdm, '', yjxkdm, str(page))))
    return listschool
def getmajor (ssdm,dwmc,yjxkdm):
    """Return the detail-page URLs ('查看' links) for one school and discipline.

    ssdm   -- province code
    dwmc   -- school (unit) name
    yjxkdm -- first-level discipline code
    """
    def _links(soup):
        # '查看' anchors in centered cells link to the major's detail page;
        # the href is relative, so prefix the site root.
        found = []
        for cell in soup.find_all("td", class_="ch-table-center"):
            link = cell.find('a')
            if link and link.get_text() == '查看':
                found.append('https://yz.chsi.com.cn/' + link.get('href'))
        return found

    soup = getpost(ssdm, dwmc, yjxkdm, '')
    listmajor = _links(soup)  # page 1
    # On this page the pager anchors use href="####"; the second-to-last one
    # holds the last page number.
    pagers = soup.find_all(href='####')
    if len(pagers) > 1:
        pagenum = int(pagers[-2].get_text())
        for page in range(2, pagenum + 1):
            listmajor.extend(_links(getpost(ssdm, dwmc, yjxkdm, str(page))))
    return listmajor
def getsubject (url, i, j, list):
    """Scrape one major's detail page and append one row per exam-plan entry.

    url  -- detail-page URL produced by getmajor()
    i    -- index into the module-level ``province`` list
    j    -- index into the module-level ``subject`` list
    list -- row accumulator (parameter name kept for caller compatibility,
            even though it shadows the builtin)

    Returns the same accumulator with the new rows appended.
    """
    r = requests.post(url)
    soup = BeautifulSoup(r.content, "lxml")
    # Admission summary fields, in page order: 招生单位, 考试方式, 院系所,
    # 跨专业, 专业, 学习方式, 研究方向, 指导老师, 拟招人数.
    condition = soup.find_all(class_='zsml-summary')
    # Each zsml-res-items block is one exam-subject combination
    # (政治 / 外语 / 业务课一 / 业务课二).
    results = soup.find_all(class_='zsml-res-items')
    for result in results:
        information = [province[i]['mc']]                       # row starts with the province name
        information.extend(field.get_text() for field in condition)
        for cell in result.find_all("td"):
            # Strip whitespace/newlines and the boilerplate '见招生简章' marker.
            information.append(cell.get_text().replace(' ', '').replace('见招生简章', '').replace('\n', '').replace('\r', ''))
        # Progress trace: totals, current indices, school/major identifiers.
        print(str(len(province)) + ':' + str(i) + str(province[i]['mc']) + str(len(subject)) + ':' + str(j)+condition[2].get_text()+condition[4].get_text()+condition[6].get_text())
        list.append(information)
    return list
def getpost (ssdm,dwmc,yjxkdm,pageno):
    """POST a query to chsi.com.cn and return the parsed page as BeautifulSoup.

    ssdm   -- province code
    dwmc   -- school (unit) name; empty string selects the school-list endpoint
    yjxkdm -- first-level discipline code
    pageno -- result page number as a string ('' for page 1); pages beyond the
              first have no direct URL, so the page index must be POSTed

    Returns a BeautifulSoup document on success, or -1 after all retries fail
    (most likely a network problem or a banned IP).
    """
    global NETWORK_STATUS
    # An empty dwmc means we are listing schools (getschool); otherwise we are
    # listing majors of one school (getmajor).
    if dwmc == '':
        url = 'https://yz.chsi.com.cn/zsml/queryAction.do'
    else:
        url = 'https://yz.chsi.com.cn/zsml/querySchAction.do'
    header = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        # NOTE: Content-Length is deliberately NOT set by hand; requests
        # computes the correct value (the old hard-coded '58' was wrong for
        # most payloads).
        'Content-Type': 'application/x-www-form-urlencoded',
        'Host': 'yz.chsi.com.cn',
        'Origin': 'https://yz.chsi.com.cn',
        'Referer': 'https://yz.chsi.com.cn/zsml/queryAction.do?ssdm=11&dwmc=&mldm=&mlmc=&yjxkdm=0812&zymc=&xxfs=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'X-Requested': 'XMLHttpRequest',
    }
    data = {'ssdm': ssdm, 'dwmc': dwmc, 'yjxkdm': yjxkdm, 'pageno': pageno}
    try:
        # The original request carried no timeout, so the Timeout handler
        # below could never fire; 10s keeps one hung connection from stalling
        # the whole crawl.
        r = requests.post(url, headers=header, data=data, timeout=10)
        if r.status_code == 200:
            NETWORK_STATUS = True  # recover from an earlier timeout
            return BeautifulSoup(r.content.decode('utf-8'), 'lxml')
    except requests.exceptions.Timeout:
        NETWORK_STATUS = False  # remember the timeout and fall into the retry loop
    if NETWORK_STATUS == False:
        '''请求超时'''
        # Retry up to 9 times with a short timeout.  A timeout during a retry
        # is swallowed so the remaining attempts still run (the original let
        # it propagate and kill the worker process).
        for i in range(1, 10):
            print('请求超时,第%s次重复请求' % i)
            try:
                r = requests.post(url, headers=header, data=data, timeout=5)
            except requests.exceptions.Timeout:
                continue
            if r.status_code == 200:
                NETWORK_STATUS = True
                return BeautifulSoup(r.content.decode('utf-8'), 'lxml')
    return -1  # 当所有请求都失败,返回 -1 ,此时有极大的可能是网络问题或IP被封。
def multpro(i):
    """Crawl every discipline for the i-th province and save the rows to one xlsx file."""
    rows = []
    for j, subj in enumerate(subject):
        # Schools in this province that offer this discipline.
        schools = getschool(str(province[i]['dm']), str(subj['dm']))
        for school in schools:
            # Visit each major's detail page and accumulate its rows.
            for major_url in getmajor(str(province[i]['dm']), str(school), str(subj['dm'])):
                rows = getsubject(major_url, i, j, rows)
        # Log progress after every discipline so an interrupted run can be audited.
        with open(noteurl, "a") as f:
            f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ')
            f.write(str(len(province))+':'+str(i)+' '+str(len(subject))+':'+str(j)+'\n')
    save(i, rows)
def createxcel(workurl):
    """Open (or create) the workbook at *workurl* and (re)write the header row.

    Returns the openpyxl workbook; the caller appends data rows and saves.
    """
    if os.path.isfile(workurl):
        workbook = openpyxl.load_workbook(workurl)
    else:
        workbook = openpyxl.Workbook()
    worksheet = workbook.active
    # Header row, column 1..14 — always (re)written with the same titles.
    headers = ('所在省市', '招生单位', '考试方式', '院系所', '跨专业', '专业',
               '学习方式', '研究方向', '指导老师', '拟招人数', '政治', '外语',
               '业务课一', '业务课二')
    for col, title in enumerate(headers, start=1):
        worksheet.cell(1, col, title)
    workbook.save(workurl)
    return workbook
def save(i, list):
    """Append the crawled rows for province *i* to its per-province xlsx file."""
    workurl = workfolderurl + '/' + str(i) + '.xlsx'
    workbook = createxcel(workurl)  # guarantees the file and header row exist
    sheet = workbook.active
    for row in list:
        sheet.append(row)
    workbook.save(workurl)
    # Record the completed save in the log for later auditing.
    with open(noteurl, "a") as f:
        f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ')
        f.write('保存' + province[i]['mc'] + '全部成功' + '\n')
def normalization():
    """Merge every per-province xlsx in workfolderurl into one normalization.xlsx."""
    normalizationurl = workfolderurl + '/' + 'normalization.xlsx'
    workbook = openpyxl.Workbook()
    ws = workbook.create_sheet('normalization', 0)
    # Header row, identical to the per-province files.
    headers = ('所在省市', '招生单位', '考试方式', '院系所', '跨专业', '专业',
               '学习方式', '研究方向', '指导老师', '拟招人数', '政治', '外语',
               '业务课一', '业务课二')
    for col, title in enumerate(headers, start=1):
        ws.cell(1, col, title)
    num = 2  # next free row in the merged sheet (row 1 is the header)
    for workexcel in os.listdir(workfolderurl):
        # Skip the log file and any normalization.xlsx left from a previous
        # run: the original crashed calling load_workbook on note.txt.
        if not workexcel.endswith('.xlsx') or workexcel == 'normalization.xlsx':
            continue
        workurl = workfolderurl + '/' + workexcel
        sheet = openpyxl.load_workbook(workurl).active
        rows = list(sheet.rows)  # materialize ONCE (original rebuilt this list per cell)
        # Province name from the first data row, used only for log messages;
        # str() is required — the original concatenated a Cell object with a
        # str and raised TypeError.
        label = str(rows[1][0].value) if len(rows) > 1 else workexcel
        for i in range(1, sheet.max_row):  # row 0 is the per-file header
            for j in range(0, sheet.max_column):
                ws.cell(row=num, column=j + 1).value = rows[i][j].value
            if (i % 100 == 0):
                with open(noteurl, "a") as f:
                    f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ')
                    f.write("已合并" + label + ':' + str(i) + '\n')
            num = num + 1
        workbook.save(normalizationurl)  # checkpoint after each source file
        with open(noteurl, "a") as f:
            f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ')
            f.write("合并" + label + '成功' + '\n')
    workbook.save(normalizationurl)
if __name__ == '__main__':
    # Start from a clean output folder so stale per-province files cannot leak
    # into the final merged workbook.  makedirs fixes a crash when the folder
    # did not exist yet (os.listdir raised FileNotFoundError).
    os.makedirs(workfolderurl, exist_ok=True)
    for file in os.listdir(workfolderurl):
        os.remove(workfolderurl + '/' + file)
    with open(noteurl, "wt") as f:
        f.write(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' ')
        f.write("开始记录\n")  # log file; used afterwards to check the crawl covered everything
    # One async task per province on a 7-worker pool.  NOTE(review):
    # apply_async swallows worker exceptions unless .get() is called, so a
    # failed province only shows up as a gap in the log.
    pool = multiprocessing.Pool(processes=7)
    for i in range(len(province)):
        pool.apply_async(func=multpro, args=(i, ))
    pool.close()
    pool.join()
    normalization()  # merge the per-province files into normalization.xlsx