使用 XPath 爬取安居客租房数据并保存到 MySQL 数据库

本文介绍了一个使用Python编写的安居客租房信息爬虫程序。该程序能够自动抓取北京地区安居客网站上的房源信息,并将其存储到MySQL数据库中。具体抓取的信息包括房源的价格、地铁线、朝向、租房方式等详细数据。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import requests
from lxml import etree
import pymysql

from fake_useragent import UserAgent

class AnJuKe:
    """Scrape Beijing rental listings from Anjuke and store them in MySQL.

    Instantiating the class starts the crawl immediately: ``__init__`` calls
    :meth:`get_html`, which walks result pages 1-50, parses every listing on
    each page with XPath and writes one row per listing through
    :meth:`save_data`.
    """

    def __init__(self):
        self.count = 1  # running number of the listing; also used as the row id
        self.get_html()  # the crawl is driven from the constructor

    @staticmethod
    def _pick(values, index=0):
        """Return ``values[index]``, or ``""`` when the list is too short."""
        return values[index] if len(values) > index else ""

    @staticmethod
    def _split_floor(text):
        """Split a ``"<floor>/<total>"`` string into ``(floor, total_floor)``.

        The floor part always gets the "层" suffix appended; the total part is
        ``""`` when the text contains no "/".
        """
        parts = text.split("/")
        floor = parts[0] + "层"
        total_floor = parts[1] if len(parts) > 1 else ""
        return floor, total_floor

    def get_html(self):
        """Download result pages 1 through 50 and hand each one to the parser.

        A page whose request fails (including a timeout) is reported and
        skipped so that one bad response does not abort the whole crawl.
        """
        # The headers never change between pages, so build them once.
        # NOTE(review): the cookie is a hard-coded logged-in session and has
        # presumably expired; "br" in accept-encoding may yield an undecodable
        # body if no brotli decoder is installed -- confirm before relying on it.
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "zh-CN,zh;q=0.9",
            "cookie": "aQQ_ajkguid=07DC0869-C342-00F4-79D0-1588D78134CB; ctid=14; 58tj_uuid=fb71c4df-cb3b-4bf7-9766-8bfde756cdce; als=0; sessid=8546D53E-CCF8-0879-A17A-47AF9391F465; lps=http%3A%2F%2Fbj.zu.anjuke.com%2F%7C; twe=2; ajk_member_captcha=2228a65d8b0d1b1e4ba3dec8fb1e43a8; ajk_member_id=156056657; ajk_member_name=U15532524315938; ajk_member_key=def42d9eebe69aaeb09b1d231cf34dc7; ajk_member_time=1584788400; aQQ_ajkauthinfos=caoYy9ejUIIJbpJjMRQkqdltKpy6kzmmh3LR%2FBdkHDrBDpLDt06Ex4ZtXUQ0YcIy3W6Obs4wPjiByDeDKdw88fV4F6o; lui=156056657%3A1; wmda_uuid=8dc25ee12165129434ec90979cc987cc; wmda_new_uuid=1; wmda_visited_projects=%3B6289197098934; wmda_session_id_6289197098934=1553402902639-1f0543a2-5573-df61; __xsptplusUT_8=1; init_refer=; new_uv=3; new_session=0; __xsptplus8=8.3.1553402903.1553402937.5%234%7C%7C%7C%7C%7C%23%23XEYaSeMn8-kfuWzB99gWsriGgiUFhaVp%23",
            "referer": "https://bj.zu.anjuke.com/",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
        }

        for page in range(1, 51):
            print("================第{}页================".format(page))
            base_url = "https://bj.zu.anjuke.com/fangyuan/p{}/".format(page)
            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(base_url, headers=headers, timeout=10)
            except requests.RequestException as exc:
                print(page, exc)
                continue
            self.parse_htmt(etree.HTML(response.text))

    def _parse_item(self, li_xml):
        """Extract the fields of one listing element into a dict.

        Missing nodes yield empty strings instead of raising, so a listing
        with an unexpected layout still produces a (partially filled) record.
        The dict keys match the keyword parameters of :meth:`save_data`.
        """
        # One query serves room/hall, area, floor and agent: they are the
        # 1st, 2nd, 3rd and 4th text nodes of the listing's <p> elements.
        texts = li_xml.xpath('./div/p/text()')
        floor, total_floor = self._split_floor(self._pick(texts, 2))

        return {
            "price": self._pick(li_xml.xpath('./div[@class="zu-side"]/p/strong/text()')) + "元/月",
            # metro line
            "train": self._pick(li_xml.xpath('.//span[@class="cls-3"]/text()')),
            # orientation
            "direction": self._pick(li_xml.xpath('.//span[@class="cls-2"]/text()')),
            # rental type
            "rent_way": self._pick(li_xml.xpath('.//span[@class="cls-1"]/text()')),
            # district
            "chengqu": self._pick(li_xml.xpath('./div/address/a/text()')),
            "agent": self._pick(texts, 3).strip(),
            "floor": floor,
            "total_floor": total_floor,
            "area": self._pick(texts, 1),
            "title": self._pick(li_xml.xpath("./div/h3/a/text()")),
            "room_hall": self._pick(texts, 0).strip(),
            # first image URL of the listing
            "imgaes": self._pick(li_xml.xpath('./a/img/@src')),
        }

    def parse_htmt(self, html):
        """Parse every listing in a result page and persist each one.

        :param html: parsed document as returned by ``etree.HTML``; ``None``
            (no parsable document) is ignored.
        """
        if html is None:
            return

        # Narrow the search to the listing containers.
        for li_xml in html.xpath(".//div[contains(@class,'itemmod')]"):
            anju_dict = self._parse_item(li_xml)
            # The dict keys are exactly save_data's parameter names.
            self.save_data(self.count, **anju_dict)
            print(self.count, anju_dict)
            self.count += 1

    def save_data(self, count, price, train, direction, rent_way, chengqu, agent, floor, total_floor, area, title,
                  room_hall, imgaes):
        """Insert one listing into the ``anjuke`` table, creating it if needed.

        ``count`` is written as the row id; all other values are stored as
        strings. Database errors are reported and the transaction is rolled
        back; they are not re-raised, so the crawl continues with the next
        listing.
        """
        # Both handles start as None so the cleanup below is safe even when
        # connecting itself fails.
        mydb = None
        mycon = None
        try:
            # NOTE(review): credentials are hard-coded; consider moving them
            # to configuration.
            mydb = pymysql.connect(host='localhost', user='root', password='111111', database='test')
            mycon = mydb.cursor()
            mycon.execute(
                'create table if not EXISTS anjuke(id int(9) auto_increment primary key ,price VARCHAR(255) ,train varchar(30),direction VARCHAR(255),rent_way VARCHAR(255),chengqu VARCHAR(255),agent VARCHAR(255),floor VARCHAR(255),total_floor VARCHAR(255),area VARCHAR(255),title VARCHAR(255),room_hall VARCHAR(255),imgaes VARCHAR(255) )'
            )
            # The id is supplied explicitly, so re-running against a table
            # that already holds these ids fails with a duplicate-key error,
            # which is now reported below instead of being swallowed.
            sql = 'insert into anjuke(id,price,train,direction,rent_way,chengqu,agent,floor,total_floor,area,title,room_hall,imgaes)VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
            val = (int(count), str(price), str(train), str(direction), str(rent_way), str(chengqu), str(agent),
                   str(floor), str(total_floor), str(area), str(title), str(room_hall), str(imgaes))
            mycon.execute(sql, val)
            print(mycon.rowcount, "记录插入成功")
            mydb.commit()
        except pymysql.MySQLError as exc:
            print(count, exc)
            if mydb:
                mydb.rollback()
        finally:
            if mycon:
                mycon.close()
            if mydb:
                mydb.close()

if __name__ == "__main__":
    # Constructing the crawler starts the crawl (its __init__ calls get_html).
    AnJuKe()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值