# -*- coding: utf-8 -*-
"""Scrape Tencent HR job postings and store them in MySQL.

For a user-supplied search keyword, walks the paginated listing at
hr.tencent.com (10 postings per page), follows each posting's detail
link, extracts title / location / category / headcount / duties /
requirements, and inserts one row per posting via MysqlHelper.
"""
import os  # NOTE(review): unused here -- kept in case other project code relies on it
import requests
from lxml import etree
from def2_mysql import MysqlHelper

myhelper = MysqlHelper()

# Parameterized INSERT: values are bound by the DB driver (no SQL injection).
# NOTE: the last column really is named "tow_yaoqiu" in the schema (sic) --
# do not "fix" the spelling without an accompanying schema migration.
sql = (
    'INSERT INTO tengxun (one_title, one_didian, one_leibie, one_renshu, '
    'one_zhize, tow_yaoqiu) VALUES (%s, %s, %s, %s, %s, %s)'
)

# HTTP/HTTPS proxy used for every request.
# SECURITY: credentials are hard-coded; move them to environment variables
# or a config file before sharing this source.
proxy = {
    'http': 'http://alice:123456@120.78.166.84:6666',
    'https': 'http://alice:123456@120.78.166.84:6666',
}

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/63.0.3239.84 Safari/537.36",
}

# Per-request network timeout in seconds; without one, a stalled proxy
# connection would hang the scraper forever.
REQUEST_TIMEOUT = 10


def A(fenlei, yema):
    """Fetch listing pages for keyword *fenlei* and hand links to B().

    :param fenlei: search keyword (category) entered by the user.
    :param yema: total number of result rows to walk; the site paginates
                 10 postings per page, hence the step of 10.
    """
    for pa in range(0, yema, 10):
        url = ('https://hr.tencent.com/position.php'
               '?keywords={}&lid=0&tid=0&start={}').format(fenlei, pa)
        try:
            response = requests.get(url, headers=headers, proxies=proxy,
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Skip this page rather than aborting the whole run.
            print('列表页请求失败: {} ({})'.format(url, exc))
            continue
        # Parse the raw bytes so lxml honours the page's declared encoding.
        html_ele = etree.HTML(response.content)
        # Relative links to each posting's detail page.
        li_list = html_ele.xpath('//table[@class="tablelist"]/tr/td/a/@href')
        print('正在保存第' + str(int(pa / 10 + 1)) + '页.............')
        B(li_list)


def B(li_list):
    """Fetch each posting's detail page and insert its fields into MySQL.

    :param li_list: relative URLs (href values) from the listing page.
    """
    for li_ele in li_list:
        print(li_ele)
        last_url = 'https://hr.tencent.com/' + li_ele
        try:
            response = requests.get(last_url, headers=headers, proxies=proxy,
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print('详情页请求失败: {} ({})'.format(last_url, exc))
            continue
        img_ele = etree.HTML(response.text)
        # The detail table holding all fields of one posting.
        img_sum = img_ele.xpath('//table[@class="tablelist textl"]')
        for j in img_sum:
            try:
                # Guard the [0] lookups: a layout change would otherwise
                # raise IndexError and kill the entire scrape.
                one_title = j.xpath('./tr[1]/td/text()')[0]
                one_didian = j.xpath('./tr[2]/td[1]/text()')[0]
                one_leibie = j.xpath('./tr[2]/td[2]/text()')[0]
                one_renshu = j.xpath('./tr[2]/td[3]/text()')[0]
                one_zhize = j.xpath('./tr[3]/td/ul/li/text()')[0]
            except IndexError:
                print('页面结构异常, 跳过: ' + last_url)
                continue
            print(one_title)
            # Requirements span multiple <li>; join them into one field.
            one_yaoqiu = j.xpath('./tr[4]/td/ul/li/text()')
            tow_yaoqiu = ' '.join(one_yaoqiu)
            data = (one_title, one_didian, one_leibie, one_renshu,
                    one_zhize, tow_yaoqiu)
            myhelper.execute_modify_sql(sql, data)


if __name__ == '__main__':
    fenlei = input('请输入您要搜索的分类:')
    yema = int(input('请输入您要几页:')) * 10
    A(fenlei, yema)