Python------爬虫实战

爬虫概述:爬取数据
用户获取网络数据的方式:
浏览器提交请求->下载网页代码->解析/渲染成页面

模拟浏览器发送请求->下载网页代码->只提取有用的数据->存放于数据库或文件中

网络爬虫------向网站发起请求,获取资源后分析并提取有用数据的程序

反爬机制:
1.非浏览器模式
2.IP限制 (防止多次请求,火车票抢票都是多个服务器跳转进行抢票)
3.验证码机制
4.封账号

爬虫操作流程
python+requests库+excel

1.模拟浏览器发送请求
2.下载网页代码
3.只提取有用的数据
4.存放于数据库或文件中

#1-模拟浏览器发送请求
import requests
import re

def get_webPages():
    """Return the total number of result pages for the 51job search.

    Fetches the first search-results page and scrapes the pager text
    ("共N页,到第") to discover how many pages exist.

    Returns:
        int: total page count.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        RuntimeError: if the pager pattern is not found (site layout changed).
    """
    web_url = "https://search.51job.com/list/150200%252C010000%252C020000%252C030200,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=ml?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    # Pretend to be a real browser so the request is not rejected outright.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    # timeout prevents the call from hanging forever on a dead connection
    resp = requests.get(web_url, headers=headers, timeout=10)
    resp.raise_for_status()
    # 2 - parse the page content; the site serves GBK, set it explicitly
    # to avoid mojibake
    resp.encoding = 'gbk'
    matches = re.findall('<span class="td">共(.*?)页,到第</span>', resp.text)
    if not matches:
        raise RuntimeError('page-count pattern not found; 51job layout may have changed')
    return int(matches[0])
print(get_webPages())

#--------------存储初始化---------------------
import xlwt

# 1 -- create an Excel workbook (held in memory until save() is called)
workBook = xlwt.Workbook(encoding = 'utf-8')
# 2 -- add a worksheet to the workbook
WorkSheet = workBook.add_sheet('51job.res')
# Write the (still empty) workbook to disk now; subsequent writes stay in
# memory until the final save at the end of the script.
workBook.save('e:\\51job.xls')
# 3 -- header row: column names (job title, company, location, salary, post date)
colName = ['职位名','公司名','工作地点','薪资','发布时间']
for col, header in enumerate(colName):
    WorkSheet.write(0, col, header)     # (row, column, value)

# Next worksheet row to fill; row 0 holds the header.
line = 1
#--------------------------------获取所有的页--------------------------------
def _first_match(pattern, text, default=''):
    """Return the first regex match of *pattern* in *text*, or *default*.

    Guards against listings that are missing a field, which would otherwise
    crash the whole scrape with an IndexError on ``findall(...)[0]``.
    """
    found = re.findall(pattern, text, re.S)
    return found[0] if found else default

#-------------------------------- fetch every result page --------------------------------
# NOTE: the inner per-listing loop previously reused the name ``one`` and
# shadowed this loop variable; distinct names are used here instead.
for page in range(1, get_webPages() + 1):
    web_url = f"https://search.51job.com/list/150200%252C010000%252C020000%252C030200,000000,0000,00,9,99,%25E8%2587%25AA%25E5%258A%25A8%25E5%258C%2596%25E6%25B5%258B%25E8%25AF%2595%25E5%25B7%25A5%25E7%25A8%258B%25E5%25B8%2588,2,{page}.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=ml?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    # Pretend to be a real browser so the request is not rejected outright.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'
    }
    # timeout prevents a dead connection from stalling the scrape forever
    resp = requests.get(web_url, headers=headers, timeout=10)
    # 2 - decode the page; the site serves GBK, set it explicitly to avoid mojibake
    resp.encoding = 'gbk'
    # (resp.text is the decoded str body; resp.content would be raw bytes,
    # resp.request.headers / resp.request.body / resp.headers expose the
    # request and response metadata if needed for debugging.)

    # 3 - extract the useful data: one <div class="el"> per job listing
    info = re.findall('<div class="el">(.*?)</div>', resp.text, re.S)
    # pull the individual fields out of each listing
    for job in info:
        # The first title-bearing <a> holds the job name, the second the
        # company name (see the sample markup below).
        titles = re.findall('<a target="_blank" title="(.*?)" href=', job, re.S)
        if len(titles) < 2:
            continue  # malformed entry (e.g. an ad block) -- skip instead of crashing
        # 1. job title
        JobName = titles[0]
        WorkSheet.write(line, 0, JobName)
        # 2. company name
        CompanyName = titles[1]
        WorkSheet.write(line, 1, CompanyName)
        # 3. work location
        address = _first_match('<span class="t3">(.*?)</span>', job)
        WorkSheet.write(line, 2, address)
        # 4. salary
        salary = _first_match('<span class="t4">(.*?)</span>', job)
        WorkSheet.write(line, 3, salary)
        # 5. publication date (named pub_date to avoid shadowing the stdlib
        # ``time`` module name)
        pub_date = _first_match('<span class="t5">(.*?)</span>', job)
        WorkSheet.write(line, 4, pub_date)
        line += 1
        print(JobName, CompanyName, address, salary, pub_date)

    # Sample listing markup the regexes above were written against:
    '''
    <div class="el">
            <p class="t1 ">
                <em class="check" name="delivery_em" οnclick="checkboxClick(this)"></em>
                <input class="checkbox" type="checkbox" name="delivery_jobid" value="110950720" jt="0" style="display:none" />
                <span>
                    <a target="_blank" title="自动化测试开发高级工程师" href="https://jobs.51job.com/hefei/110950720.html?s=01&t=0"  οnmοusedοwn="">
                        自动化测试开发高级工程师                </a>
                </span>
                                                                        </p>
            <span class="t2"><a target="_blank" title="联宝(合肥)电子科技有限公司" href="https://jobs.51job.com/all/co2725159.html">联宝(合肥)电子科技有限公司</a></span>
            <span class="t3">合肥</span>
            <span class="t4">1.5-2万/月</span>
            <span class="t5">03-26</span>
        </div>
        '''

# 4 - persist all collected rows to the Excel file
workBook.save('e:\\51job.xls')
#WorkSheet.write()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值