爬虫——获取研招网学校信息

这段代码演示了如何使用Python爬虫从研招网获取特定专业的招生信息,包括招生单位、考试方式、学院、专业等详细数据,并将信息保存到txt文件中。请注意避免频繁运行以免对服务器造成压力。
部署运行你感兴趣的模型镜像

本文主要介绍爬虫在研招网上的使用。请使用者不要频繁运行代码,以免对服务器造成压力,否则后果自负。同时希望该代码能帮助正在考研的学子们筛选出自己心仪的学校。

1.首先登录研招网,在专业目录中查询目标专业对应的专业代码。

 

 

 2.将代码中的专业代码(即 YanZhaoWang('1055') 括号内的值)改为上一步查到的专业代码

 

 

 3.运行程序,代码附在下方。运行结束后会在当前目录下生成文本文件 text.txt(文件已存在时在末尾追加),你想要的信息就在里面。

import copy
import re
from urllib.parse import urljoin

import requests
from lxml import etree

# Site root; every href scraped from a page is resolved against it.
BASE_URL = "https://yz.chsi.com.cn/"

# Seconds to wait for the server before a request is abandoned.
REQUEST_TIMEOUT = 30

# Query-parameter template of the catalogue search form.  Kept at module
# level for backward compatibility; each YanZhaoWang works on its own copy.
data = {
    "ssdm": "",
    "dwmc": "",
    "mldm": "zyxw",
    "mlmc": "",
    "yjxkdm": "",
    "zymc": "",
    "xxfs": "",
    "pageno": "",
}


class UniversityInfo:
    """One output row: a programme entry plus one group of exam subjects."""

    def __init__(self):
        # Enrolment unit, examination method, college/department, major,
        # learning style, research direction, instructor, planned intake,
        # remarks, politics, foreign language, subject course 1, subject
        # course 2.
        self.EnrollmentUnit = ""
        self.ExaminationMethod = ""
        self.CollegesDepartments = ""
        self.Major = ""
        self.learningStyle = ""
        self.ResearchDirection = ""
        self.Instructor = ""
        self.Number = ""
        self.Remarks = ""
        self.Politics = ""
        self.English = ""
        self.BusinessClass1 = ""
        self.BusinessClass2 = ""


class YanZhaoWang:
    """Scrape the enrolment catalogue for one discipline code.

    ``run()`` walks the paginated school list, follows every school to its
    programme entries, parses each entry and appends the rows to
    ``./text.txt`` as tab-separated text.
    """

    # Cells of the programme summary table, addressed by (row, column).
    _CONDITION_CELL = (
        '//table[@class="zsml-condition"]/tbody/tr[{}]/td[{}]/text()'
    )

    def __init__(self, code):
        """Prepare a scraper for the discipline code ``code`` (e.g. '1055')."""
        # "{{}}" survives this format() call as "{}", so the page number can
        # be filled in later with ``self.url.format(page)``.
        self.url = (
            "https://yz.chsi.com.cn/zsml/queryAction.do"
            "?ssdm&dwmc&mldm=zyxw&mlmc&yjxkdm={}&zymc&xxfs&pageno={{}}"
        ).format(code)
        # Copy so that instances do not mutate the shared module-level dict.
        self.data = dict(data)
        self.data["yjxkdm"] = code
        self.page = 1
        self.schoolUrl = []
        self.facultyUrl = []
        self.schoolInfo = []

    @staticmethod
    def _fetch(url):
        """POST to ``url`` and return the response body parsed by lxml."""
        return etree.HTML(requests.post(url, timeout=REQUEST_TIMEOUT).text)

    @staticmethod
    def _first(node, path):
        """Return the first result of ``path`` on ``node``, or "" if none."""
        found = node.xpath(path)
        return found[0] if found else ""

    def GetFacultyUrl(self):
        """Collect the detail-page href of every entry of every school."""
        for school_href in self.schoolUrl:
            tree = self._fetch(urljoin(BASE_URL, school_href))
            for href in tree.xpath('//table/tbody//td[8]/a/@href'):
                print(href)
                self.facultyUrl.append(href)

    def GetSchoolInfo(self):
        """Parse every detail page into ``UniversityInfo`` rows.

        One row is appended to ``self.schoolInfo`` per exam-subject group
        found on the page; a page without any group contributes no row.
        """
        cell = self._CONDITION_CELL
        for faculty_href in self.facultyUrl:
            tree = self._fetch(urljoin(BASE_URL, faculty_href))

            base = UniversityInfo()
            base.EnrollmentUnit = self._first(tree, cell.format(1, 2))
            base.ExaminationMethod = self._first(tree, cell.format(1, 4))
            base.CollegesDepartments = self._first(tree, cell.format(2, 2))
            base.Major = self._first(tree, cell.format(2, 4))
            base.learningStyle = self._first(tree, cell.format(3, 2))
            base.ResearchDirection = self._first(tree, cell.format(3, 4))
            # Kept as the full list of text nodes; Tostring() joins it.
            base.Instructor = tree.xpath(cell.format(4, 2))
            base.Number = self._first(tree, cell.format(4, 4))
            base.Remarks = self._first(
                tree, '//table[@class="zsml-condition"]/tbody/tr[5]/text()'
            )

            for group in tree.xpath('//tbody[@class="zsml-res-items"]'):
                # A fresh copy per group: appending the same object again
                # would make every stored row show only the last group.
                row = copy.copy(base)
                row.Politics = self._first(group, 'tr/td[1]/text()')
                row.English = self._first(group, 'tr/td[2]/text()')
                row.BusinessClass1 = self._first(group, 'tr/td[3]/text()')
                row.BusinessClass2 = self._first(group, 'tr/td[4]/text()')
                self.schoolInfo.append(row)

    def GetSchoolUrl(self, url):
        """Collect school hrefs starting at ``url`` and following pagination.

        ``self.page`` is advanced for each further page requested.
        """
        while True:
            tree = self._fetch(url)
            next_page = tree.xpath('//li[@class="lip lip-last"]/a/@onclick')
            self.schoolUrl.extend(tree.xpath('//*[@id="form3"]/a/@href'))
            print(next_page)
            # An empty result is treated as "no further page".
            if not next_page:
                break
            self.page += 1
            url = self.url.format(self.page)

    def WriteSchool(self):
        """Append every collected row to ./text.txt, tab-separated."""
        with open("./text.txt", 'a', encoding="utf-8") as f:
            for info in self.schoolInfo:
                print(info.EnrollmentUnit)
                fields = (
                    info.EnrollmentUnit,
                    info.ExaminationMethod,
                    info.CollegesDepartments,
                    info.Major,
                    info.learningStyle,
                    info.ResearchDirection,
                    info.Instructor,
                    info.Number,
                    info.Remarks,
                    info.Politics,
                    info.English,
                    info.BusinessClass1,
                    info.BusinessClass2,
                )
                f.write("\t".join(self.Tostring(v) for v in fields) + "\n")

    def Tostring(self, str):
        """Join ``str`` (a string or list of strings) and strip tab, CR, LF.

        The parameter keeps its original name for keyword-call compatibility
        even though it shadows the builtin.
        """
        joined = "".join(str)
        # Tabs and line breaks would corrupt the tab-separated output.
        return re.sub(r"[\t\r\n]+", "", joined)

    def run(self):
        """Execute the full pipeline: list, follow, parse, write."""
        self.GetSchoolUrl(self.url.format(self.page))
        self.GetFacultyUrl()
        self.GetSchoolInfo()
        self.WriteSchool()


if __name__ == '__main__':
    # Put the discipline (major) code inside the parentheses.
    yzw = YanZhaoWang('1055')
    yzw.run()

  

代码详细解释

代码运行过程中需要requests、lxml、re库

学校信息由UniversityInfo类的实例以属性形式储存

YanZhaoWang类主要负责实现代码的主要逻辑

 

您可能感兴趣的与本文相关的镜像

Python3.10

Python3.10

Conda
Python

Python 是一种高级、解释型、通用的编程语言,以其简洁易读的语法而闻名,适用于广泛的应用,包括Web开发、数据分析、人工智能和自动化脚本

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值