import requests
from lxml import etree
import time
from mysql import MysqlHelper
# Create the module-level MysqlHelper instance; wujia() uses it to run its
# INSERT statements via mc.execute_modify_sql().
mc = MysqlHelper()
def _first_text(node, path):
    """Return the first result of ``node.xpath(path)``, or None when nothing matches."""
    found = node.xpath(path)
    return found[0] if found else None


def wujia(url, headers, *, pages=range(1, 21), timeout=10, delay=0):
    """Scrape rental listings page by page and store each one through ``mc``.

    For every page number in ``pages`` the URL template is filled in, the page
    is downloaded and parsed, and each ``<li>`` under ``ul.pList`` is reduced
    to an (address, room description, price) row that is printed and handed to
    ``mc.execute_modify_sql``.

    Args:
        url: URL template containing one ``{}`` slot for the page number.
        headers: Mapping of HTTP request headers passed to ``requests.get``.
        pages: Iterable of page numbers to fetch. Defaults to pages 1-20,
            which is the range the function originally hard-coded.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the crawl forever.
        delay: Seconds to sleep after each page; 0 (the default) disables the
            pause.

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded
            (including when ``timeout`` is exceeded).
    """
    # NOTE(review): the placeholders are wrapped in double quotes. If
    # MysqlHelper.execute_modify_sql forwards ``data`` to a DB-API cursor as
    # bound parameters, the driver adds its own quoting and the stored text
    # would carry extra quote characters. Confirm against the helper before
    # changing the statement; it is kept as-is here.
    sql = 'insert into 5i5j(dizhi,room_data,money) values ("%s","%s","%s")'
    no_address = '无地址'

    for page in pages:
        response = requests.get(url.format(page), headers=headers, timeout=timeout)
        tree = etree.HTML(response.text)
        if tree is None:
            # Presumably lxml could not build a document from the body
            # (e.g. an empty response); there is nothing to extract.
            print('skip page %s: no parseable HTML' % page)
            continue

        for item in tree.xpath('//ul[@class="pList"]/li'):
            address_raw = _first_text(item, './div[2]/div/p[2]/text()')
            room_raw = _first_text(item, './div[2]/div[1]/p[1]/text()')
            price_raw = _first_text(item, './div[2]/div[1]/div//strong/text()')

            # An <li> without room or price text is not a usable listing;
            # skip it rather than let an IndexError abort the whole crawl.
            if room_raw is None or price_raw is None:
                print('skip listing: room data or price missing')
                continue
            try:
                money = int(price_raw)
            except ValueError:
                print('skip listing: price is not a number: %r' % (price_raw,))
                continue

            # A missing address is stored as the same placeholder that is
            # printed, instead of leaking the empty xpath result list into
            # the database parameters.
            if address_raw is None:
                dizhi = no_address
            else:
                dizhi = address_raw.replace('·', '').strip()
            room_data = room_raw.replace(' ', '').replace('·', '')

            print(dizhi)
            print(room_data)
            print(money)

            # Hand the row to the MySQL helper for insertion.
            mc.execute_modify_sql(sql, (dizhi, room_data, money))

        # Optional pause between pages to avoid hammering the site.
        if delay:
            time.sleep(delay)
# Script entry point: assemble the request parameters and start the crawl.
if __name__ == "__main__":
    # Individual cookie pairs (presumably copied from a browser session);
    # they are joined with "; " into a single Cookie header value below.
    cookie_pairs = (
        "_Jo0OQK=1349C4397FAF634DC28494B7372F7EE6A6BAFA7626819FB7027758023F7044ABBF04A8994B1C24AC6546C9F8AD8FBB6099D9577A4343988ABD03B1C034CCCF5A512DE8682CA7D10E3B498FB9E3C853EFEE298FB9E3C853EFEE215D8BEE34E43E5C0GJ1Z1IQ==",
        "_ga=GA1.2.1741003594.1534567087",
        "_gid=GA1.2.429105984.1534567087",
        "yfx_c_g_u_id_10000001=_ck18081812380714557549321883736",
        "PHPSESSID=a1eros861f9teh5n4r6lv6f30j",
        "Hm_lvt_94ed3d23572054a86ed341d64b267ec6=1534567088,1534572065",
        "zufang_BROWSES=41290006",
        "domain=bj",
        "yfx_f_l_v_t_10000001=f_t_1534567087448__r_t_1534567087448__v_t_1534591789846__r_c_0",
        "Hm_lpvt_94ed3d23572054a86ed341d64b267ec6=1534591798",
    )

    # Browser-like request headers. Accept-Encoding and Accept-Language are
    # not sent.
    request_headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Cookie": "; ".join(cookie_pairs),
        "Host": "bj.5i5j.com",
        "Referer": "https://bj.5i5j.com/zufang/huilongguan/n2/",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    }

    # The "{}" in the URL is the page-number slot that wujia() fills in.
    listing_url = "https://bj.5i5j.com/zufang/huilongguan/n{}/"
    wujia(listing_url, request_headers)