import requests
from lxml import etree
import pymysql
from fake_useragent import UserAgent
class AnJuKe:
    """Scrape Beijing rental listings from anjuke.com and store them in MySQL.

    Constructing an instance immediately starts the crawl: ``__init__`` calls
    :meth:`get_html`, which downloads each listing page, hands the parsed tree
    to :meth:`parse_htmt`, which in turn writes every listing through
    :meth:`save_data`.
    """

    # Listing-page URL template; ``{}`` receives the 1-based page number.
    _BASE_URL = "https://bj.zu.anjuke.com/fangyuan/p{}/"

    # Seconds to wait for the server before giving up on a page, so a stalled
    # connection cannot hang the whole crawl.
    _REQUEST_TIMEOUT = 10

    # Browser-like request headers sent with every page request.
    # NOTE(review): the cookie is a hard-coded session value and is presumably
    # stale by now; confirm whether the site still needs it.
    # NOTE(review): "br" is advertised in accept-encoding; verify that the
    # installed requests/urllib3 can decode brotli responses.
    _HEADERS = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "zh-CN,zh;q=0.9",
        "cookie": "aQQ_ajkguid=07DC0869-C342-00F4-79D0-1588D78134CB; ctid=14; 58tj_uuid=fb71c4df-cb3b-4bf7-9766-8bfde756cdce; als=0; sessid=8546D53E-CCF8-0879-A17A-47AF9391F465; lps=http%3A%2F%2Fbj.zu.anjuke.com%2F%7C; twe=2; ajk_member_captcha=2228a65d8b0d1b1e4ba3dec8fb1e43a8; ajk_member_id=156056657; ajk_member_name=U15532524315938; ajk_member_key=def42d9eebe69aaeb09b1d231cf34dc7; ajk_member_time=1584788400; aQQ_ajkauthinfos=caoYy9ejUIIJbpJjMRQkqdltKpy6kzmmh3LR%2FBdkHDrBDpLDt06Ex4ZtXUQ0YcIy3W6Obs4wPjiByDeDKdw88fV4F6o; lui=156056657%3A1; wmda_uuid=8dc25ee12165129434ec90979cc987cc; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; wmda_session_id_6289197098934=1553402902639-1f0543a2-5573-df61; __xsptplusUT_8=1; init_refer=; new_uv=3; new_session=0; __xsptplus8=8.3.1553402903.1553402937.5%234%7C%7C%7C%7C%7C%23%23XEYaSeMn8-kfuWzB99gWsriGgiUFhaVp%23",
        "referer": "https://bj.zu.anjuke.com/",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    }

    # Schema of the destination table (created on first save if missing).
    _CREATE_TABLE_SQL = (
        'create table if not EXISTS anjuke('
        'id int(9) auto_increment primary key ,'
        'price VARCHAR(255) ,'
        'train varchar(30),'
        'direction VARCHAR(255),'
        'rent_way VARCHAR(255),'
        'chengqu VARCHAR(255),'
        'agent VARCHAR(255),'
        'floor VARCHAR(255),'
        'total_floor VARCHAR(255),'
        'area VARCHAR(255),'
        'title VARCHAR(255),'
        'room_hall VARCHAR(255),'
        'imgaes VARCHAR(255) )'
    )

    # Parameterized insert; values are always bound, never string-formatted.
    _INSERT_SQL = (
        'insert into anjuke(id,price,train,direction,rent_way,chengqu,agent,'
        'floor,total_floor,area,title,room_hall,imgaes)'
        'VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    )

    # Flipped to True (per instance) once the table DDL has been issued, so
    # the CREATE TABLE round trip is not repeated for every row.
    _table_ready = False

    def __init__(self, max_pages=50):
        """Initialise the record counter and start crawling right away.

        Args:
            max_pages: Number of listing pages to fetch (default 50).
        """
        self.count = 1  # running record number, also used as the row id
        self.get_html(max_pages)

    def get_html(self, max_pages=50):
        """Download listing pages 1..max_pages and parse each one.

        A page whose request fails (including a timeout) is reported and
        skipped so that one bad page does not abort the rest of the crawl.

        Args:
            max_pages: Number of listing pages to fetch (default 50).
        """
        for page in range(1, max_pages + 1):
            print("================第{}页================".format(page))
            url = self._BASE_URL.format(page)
            try:
                response = requests.get(
                    url, headers=self._HEADERS, timeout=self._REQUEST_TIMEOUT
                )
            except requests.RequestException as exc:
                print("request failed for {}: {}".format(url, exc))
                continue
            self.parse_htmt(etree.HTML(response.text))

    @staticmethod
    def _pick(items, index=0):
        """Return ``items[index]``, or ``""`` when that position does not exist."""
        return items[index] if len(items) > index else ""

    @staticmethod
    def _split_floor(raw):
        """Split a ``"<floor>/<total>"`` string into ``(floor + "层", total)``.

        When there is no ``/`` the total is returned as ``""`` instead of
        raising ``IndexError``.
        """
        parts = raw.split("/")
        total = parts[1] if len(parts) > 1 else ""
        return parts[0] + "层", total

    def parse_htmt(self, html):
        """Extract every listing from a parsed page and save it.

        For each listing container the fields are read with XPath, missing
        fields become empty strings, the record is passed to
        :meth:`save_data`, printed, and ``self.count`` is incremented.

        Args:
            html: Parsed page exposing ``xpath()`` (an lxml element), or
                ``None``.
        """
        # NOTE(review): etree.HTML is believed to return None for an empty
        # document; skip such pages instead of failing on ``None.xpath``.
        if html is None:
            return

        # Narrow the search to the listing containers.
        for li_xml in html.xpath(".//div[contains(@class,'itemmod')]"):
            # Price
            price = self._pick(li_xml.xpath('./div[@class="zu-side"]/p/strong/text()')) + "元/月"
            # Metro line
            train = self._pick(li_xml.xpath('.//span[@class="cls-3"]/text()'))
            # Orientation
            direction = self._pick(li_xml.xpath('.//span[@class="cls-2"]/text()'))
            # Rental type
            rent_way = self._pick(li_xml.xpath('.//span[@class="cls-1"]/text()'))
            # District
            chengqu = self._pick(li_xml.xpath('./div/address/a/text()'))

            # The detail paragraph's text nodes are read by position:
            # 0 = rooms/halls, 1 = area, 2 = "floor/total", 3 = agent.
            # Evaluate the XPath once and index defensively, because a
            # listing with fewer nodes would otherwise raise IndexError.
            details = li_xml.xpath('./div/p/text()')
            room_hall = self._pick(details, 0).strip()
            area = self._pick(details, 1)
            floor, total_floor = self._split_floor(self._pick(details, 2))
            agent = self._pick(details, 3).strip()

            # Title
            title = self._pick(li_xml.xpath("./div/h3/a/text()"))
            # Image URL
            imgaes = self._pick(li_xml.xpath('./a/img/@src'))

            anju_dict = {
                "price": price,
                "train": train,
                "direction": direction,
                "rent_way": rent_way,
                "chengqu": chengqu,
                "agent": agent,
                "floor": floor,
                "total_floor": total_floor,
                "area": area,
                "title": title,
                "room_hall": room_hall,
                "imgaes": imgaes,
            }
            # Persist the record, then echo it.
            self.save_data(self.count, price, train, direction, rent_way, chengqu, agent, floor, total_floor,
                           area, title, room_hall, imgaes)
            print(self.count, anju_dict)
            self.count += 1

    def save_data(self, count, price, train, direction, rent_way, chengqu, agent, floor, total_floor, area, title,
                  room_hall, imgaes):
        """Insert one listing into the ``anjuke`` MySQL table.

        Opens a connection, creates the table on the first call if needed,
        inserts the row with bound parameters and commits. A database error
        is reported and rolled back rather than raised, so the crawl keeps
        going; the cursor and connection are always closed.

        Args:
            count: Record number, stored as the row ``id``.
            price, train, direction, rent_way, chengqu, agent, floor,
            total_floor, area, title, room_hall, imgaes: Listing fields,
                each stored as a string.
        """
        row = (int(count), str(price), str(train), str(direction), str(rent_way), str(chengqu), str(agent),
               str(floor), str(total_floor), str(area), str(title), str(room_hall), str(imgaes))

        # Both handles start as None so the cleanup below is safe even when
        # connect() itself fails.
        mydb = None
        mycon = None
        try:
            # NOTE(review): credentials are hard-coded; consider moving them
            # to configuration.
            mydb = pymysql.connect(host='localhost', user='root', password='111111', database='test')
            mycon = mydb.cursor()
            if not self._table_ready:
                mycon.execute(self._CREATE_TABLE_SQL)
                self._table_ready = True
            mycon.execute(self._INSERT_SQL, row)
            print(mycon.rowcount, "记录插入成功")
            mydb.commit()
        except pymysql.MySQLError as exc:
            print("failed to save record {}: {}".format(count, exc))
            if mydb is not None:
                mydb.rollback()
        finally:
            if mycon is not None:
                mycon.close()
            if mydb is not None:
                mydb.close()
if __name__ == "__main__":
    # Run the scraper only when executed as a script; constructing AnJuKe
    # starts the crawl from its __init__.
    AnJuKe()