Crawler overview: scraping data
Two ways to obtain data from the web:
    Browser: submit a request -> download the page code -> parse/render it into a page
    Crawler: simulate a browser and send a request -> download the page code -> extract only the useful data -> store it in a database or file
A web crawler is a program that sends requests to a website, then analyses the returned resources and extracts the useful data.
Anti-crawling mechanisms (a minimal countermeasure sketch follows this list):
1. Non-browser detection: requests that do not look like a real browser are rejected
2. IP rate limiting: blocks repeated requests from one address (ticket-grabbing tools hop across multiple servers to get around this)
3. CAPTCHA challenges
4. Account banning
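A minimal countermeasure sketch, assuming nothing beyond the requests library: send a browser-style User-Agent so the request does not look like a bare script (mechanism 1), and pause between attempts so a single IP does not fire requests too quickly (mechanism 2). The delay and retry values are illustrative, not taken from the original script.

from time import sleep
import requests

def polite_get(url, retries=3, delay=2):
    # Browser-style User-Agent: counters the "non-browser" check
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    for attempt in range(retries):
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code == 200:
            return resp
        sleep(delay)  # back off before retrying: avoids hammering the site from one IP
    return None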
Crawler workflow
Python + the requests library + Excel
1. Simulate a browser and send the request
2. Download the page code
3. Extract only the useful data
4. Store it in a database or a file
# 1 - Simulate a browser and send the request
import requests
import re
def get_webPages():
    web_url = "https://search.51job.com/list/150200%252C010000%252C020000%252C030200,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    resp = requests.get(web_url, headers=headers)
    # 2 - Parse the page content
    resp.encoding = 'gbk'  # prevent garbled characters (51job serves GBK-encoded pages)
    pages = re.findall('<span class="td">共(.*?)页,到第</span>', resp.text)[0]
    return int(pages)
print(get_webPages())  # total number of result pages
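# A more defensive variant (sketch only, not used below): if 51job changes its markup,
# re.findall returns an empty list and indexing [0] raises IndexError. Falling back to
# a single page keeps the rest of the script alive; the fallback value of 1 is an
# assumption, not behaviour taken from the original code.
def get_webPages_safe(url, headers):
    resp = requests.get(url, headers=headers)
    resp.encoding = 'gbk'
    matches = re.findall('<span class="td">共(.*?)页,到第</span>', resp.text)
    return int(matches[0]) if matches else 1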
#-------------- Storage initialisation ---------------------
import xlwt
# 1 - Create an Excel workbook
workBook = xlwt.Workbook(encoding='utf-8')
# 2 - Create a sheet inside the workbook object
WorkSheet = workBook.add_sheet('51job.res')
workBook.save('e:\\51job.xls')  # save to a real disk path; until save() is called everything only exists in memory
# 3 - Write the column headers (job title, company, location, salary, post date)
colName = ['职位名', '公司名', '工作地点', '薪资', '发布时间']
for col in range(0, len(colName)):
    WorkSheet.write(0, col, colName[col])  # (row, column, content)
line = 1  # first data row; row 0 holds the headers
#-------------------------------- Fetch every result page --------------------------------
for page in range(1, get_webPages() + 1):
    web_url = f"https://search.51job.com/list/150200%252C010000%252C020000%252C030200,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    resp = requests.get(web_url, headers=headers)
    # 2 - Parse the page content
    resp.encoding = 'gbk'  # prevent garbled characters
    print(resp.text)  # resp.text returns a str
    # print(resp.content)          # returns bytes
    # print(resp.request.headers)  # request headers that were sent
    # print(resp.request.body)     # request body that was sent
    # print(resp.headers)          # response headers
    # 3 - Extract the useful data
    info = re.findall('<div class="el">(.*?)</div>', resp.text, re.S)
    # Extract fields from each job posting
    for one in info:
        # 1. Job title
        temp = re.findall('<a target="_blank" title="(.*?)" href=', one, re.S)
        JobName = temp[0]
        WorkSheet.write(line, 0, JobName)
        # print(JobName)
        # 2. Company name (the second title= link in the block)
        CompanyName = temp[1]
        WorkSheet.write(line, 1, CompanyName)
        # print(CompanyName)
        # 3. Work location
        address = re.findall('<span class="t3">(.*?)</span>', one, re.S)[0]  # [0] because findall returns a list
        WorkSheet.write(line, 2, address)
        # print(address)
        # 4. Salary
        salary = re.findall('<span class="t4">(.*?)</span>', one, re.S)[0]
        WorkSheet.write(line, 3, salary)
        # print(salary)
        # 5. Post date
        time = re.findall('<span class="t5">(.*?)</span>', one, re.S)[0]
        WorkSheet.write(line, 4, time)
        # print(time)
        line += 1
        print(JobName, CompanyName, address, salary, time)
#print(info[0])
# Sample of one job-posting block matched by the regex above (roughly what print(info[0]) would show):
'''
<div class="el">
<p class="t1 ">
<em class="check" name="delivery_em" onclick="checkboxClick(this)"></em>
<input class="checkbox" type="checkbox" name="delivery_jobid" value="110950720" jt="0" style="display:none" />
<span>
<a target="_blank" title="自动化测试开发高级工程师" href="https://jobs.51job.com/hefei/110950720.html?s=01&t=0" onmousedown="">
自动化测试开发高级工程师 </a>
</span>
</p>
<span class="t2"><a target="_blank" title="联宝(合肥)电子科技有限公司" href="https://jobs.51job.com/all/co2725159.html">联宝(合肥)电子科技有限公司</a></span>
<span class="t3">合肥</span>
<span class="t4">1.5-2万/月</span>
<span class="t5">03-26</span>
</div>
'''
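# Alternative extraction sketch (never called above): regular expressions are brittle
# when the markup changes, so the same five fields could also be pulled out with an
# HTML parser such as BeautifulSoup. Assumes `pip install beautifulsoup4`; the function
# name and structure are illustrative, not part of the original script.
def parse_with_bs4(page_html):
    from bs4 import BeautifulSoup  # imported lazily so the script still runs without bs4
    soup = BeautifulSoup(page_html, 'html.parser')
    rows = []
    for el in soup.find_all('div', class_='el'):
        links = el.find_all('a')  # first link is the job title, second is the company
        job = links[0].get('title', '')
        company = links[1].get('title', '') if len(links) > 1 else ''

        def span_text(cls):
            tag = el.find('span', class_=cls)
            return tag.get_text(strip=True) if tag else ''

        rows.append((job, company, span_text('t3'), span_text('t4'), span_text('t5')))
    return rows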
# 4 - Store the data: Excel
workBook.save('e:\\51job.xls')
#WorkSheet.write()
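# Optional sanity check (sketch): read the saved workbook back to confirm the rows
# landed on disk. Assumes the xlrd package is installed; skipped quietly otherwise.
try:
    import xlrd
    book = xlrd.open_workbook('e:\\51job.xls')
    sheet = book.sheet_by_index(0)
    print('rows written:', sheet.nrows)
    print('header row:', sheet.row_values(0))
except ImportError:
    pass  # xlrd not available; skip the read-back check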