# Python Crawler 4.5: Scraping 51job (前程无忧) Job Listings with urllib2 and XPath

This script collects job-listing pages with the urllib2 module, extracts the fields with XPath, and can route requests through a randomly chosen free proxy.

```python
# coding:utf-8
'''
Collect job listings from 51job with the urllib2 module,
filter the fields with XPath, and optionally route requests
through a randomly chosen free proxy.
'''
# Import the required modules
import urllib2
from lxml import etree
import random

# List of free HTTP proxies
proxy_list = [{"http": "116.8.83.3:8118"},
              {"http": "113.89.59.161:8118"},
              {"http": "113.67.183.196:8118"},
              {"http": "180.155.135.224:31425"},
              {"http": "123.161.153.238:22593"}]

# Pick one proxy at random
proxy_ip = random.choice(proxy_list)

# Ask how many result pages to crawl
nums = int(raw_input("Enter the number of pages to crawl: "))

for num in range(1, nums + 1):
    url = ('http://search.51job.com/list/170200,000000,0000,00,9,99,%2B,2,'
           + str(num) + '.html?lang=c&stype=1&postchannel=0000&workyear=99'
           '&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0'
           '&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0'
           '&address=&line=&specialarea=00&from=&welfare=')
    my_header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36'
    }

    # Build the request object
    request = urllib2.Request(url, headers=my_header)

    # # To send the request through the random proxy instead,
    # # uncomment this block and drop the urlopen call below:
    # proxy_handler = urllib2.ProxyHandler(proxy_ip)
    # proxy_opener = urllib2.build_opener(proxy_handler)
    # response = proxy_opener.open(request)

    # Send the request directly
    response = urllib2.urlopen(request)
    # 51job serves GBK-encoded pages; decode before parsing
    html = etree.HTML(response.read().decode('gbk'))

    # Job titles (the t1 column)
    name_list = html.xpath("//div[@id='resultList']/div[@class='el']"
                           "/p[@class='t1 ']/span")
    names_list = [name.xpath('string(.)').strip() for name in name_list]

    # Company names (the t2 column)
    company_list = html.xpath("//div[@class='el']/span[@class='t2']")
    companys_list = [company.xpath('string(.)').strip()
                     for company in company_list]

    # The t3 column
    month_list = html.xpath("/html/body/div[@class='dw_wp']"
                            "/div[@id='resultList']/div[@class='el']"
                            "/span[@class='t3']")
    months_list = [month.xpath('string(.)').strip() for month in month_list]

    # Posting dates (the t5 column)
    time_list = html.xpath("/html/body/div[@class='dw_wp']"
                           "/div[@id='resultList']/div[@class='el']"
                           "/span[@class='t5']")
    times_list = [time.xpath('string(.)').strip() for time in time_list]

    # Append this page's results to a text file
    f = open('qcwy.txt', 'a')
    f.write("Page %s contents:" % num + "\r\n")
    f.write("@" * 80 + "\r\n")
    for i in range(0, len(names_list)):
        info = (names_list[i] + "|" + companys_list[i] + "|"
                + months_list[i] + "|" + times_list[i] + "\r\n")
        print info
        f.write(info.encode('utf-8'))
    f.write("Page %s finished" % num + "\r\n")
    f.write("#" * 90 + "\r\n")
    f.close()

print("Crawl finished")
```

A few fixes over the raw original: the query string's garbled `°reefrom=99` is restored to `&degreefrom=99`, a duplicate proxy entry is dropped, `input()` is replaced with the safer `int(raw_input())`, and the requests-style `response.encoding = 'gbk'` (which does nothing on a urllib2 response) is replaced by decoding the GBK bytes before handing them to `etree.HTML`.
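Note that `urllib2` only exists on Python 2. As a rough guide for Python 3 readers, here is a minimal sketch of the same fetch-and-parse step using `urllib.request`; the URL, headers, and XPath expression are carried over from the script above, and the GBK encoding and page markup are assumptions that may no longer match what 51job serves today.

```python
# -*- coding: utf-8 -*-
# Minimal Python 3 sketch, assuming 51job still serves GBK pages
# and the XPath expressions above still match its markup.
import urllib.request
from lxml import etree

url = ('http://search.51job.com/list/170200,000000,0000,00,9,99,%2B,2,1.html'
       '?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99')
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/63.0.3239.132 Safari/537.36'}

req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req) as resp:
    html = etree.HTML(resp.read().decode('gbk', errors='replace'))

# Job titles, using the same XPath as the Python 2 script
for span in html.xpath("//div[@id='resultList']/div[@class='el']"
                       "/p[@class='t1 ']/span"):
    print(span.xpath('string(.)').strip())
```

The proxy variant translates the same way: `urllib.request.ProxyHandler` and `urllib.request.build_opener` mirror the commented-out `urllib2` block above.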