案例需求:
-
- 进入每套房源的详情页,获取户型、建筑面积、朝向、楼层、装修等基本信息
代码实现:
- 导入需要用到的库
import os
import re
import threading
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
import pandas as pd
- 定义获取信息的方法。房源大致分为楼房、电梯房和独栋三类,详情页结构各不相同,仅用 XPath 无法完整获取部分字段,因此主要使用正则表达式提取;网页上未提供的信息统一设为"未知"等默认值,以保证输出整齐
# Guards the "does the CSV exist yet?" check together with the append, so
# concurrent crawler threads cannot interleave rows or write two headers.
_CSV_LOCK = threading.Lock()
_CSV_PATH = '房天下.csv'


def _first_match(pattern, html, default='未知'):
    """Return the first capture group of ``pattern`` in ``html``, or ``default``."""
    found = re.search(pattern, html)
    return found.group(1) if found else default


def _extract_area(html):
    """Return the text of the ``<div id="address">`` node, or '未知' if absent."""
    try:
        tree = etree.HTML(html)
        nodes = tree.xpath('//div[@id="address"]') if tree is not None else []
        if not nodes:
            return '未知'
        # Flatten the node to plain text and strip layout whitespace.
        return nodes[-1].xpath('string(.)').replace('\n', '').replace(' ', '')
    except etree.LxmlError:
        # Best effort: an unparsable page must not lose the whole record.
        return '未知'


def _parse_house_record(html):
    """Extract one listing's fields from a detail page's HTML.

    Returns a dict whose keys are the CSV column names, in column order.
    Fields that the page does not provide fall back to a default
    ('未知', '其他' or '无' depending on the field).
    """
    # House layout needs both the room count and the hall count.
    room = _first_match(r"pageConfig.room='(.*?)';", html, None)
    hall = _first_match(r"pageConfig.hall='(.*?)';", html, None)
    if room is None or hall is None:
        house_type = '未知'
    else:
        house_type = room + '室' + hall + '厅'

    building_type = _first_match(r'"vwe.buildcategory": "(.*?)",', html, '其他')

    # Detached houses are recorded as having no lift without looking at the page.
    if building_type == '独栋':
        lift = '无'
    else:
        lift = _first_match(
            r'<span class="lab">有无电梯</span><span class="rcont">(.*?) </span>',
            html, '无')

    # Prefer the "厅结构" field; fall back to "建筑结构", then to '其他'.
    structure = _first_match(
        r'<span class="lab">厅结构</span><span class="rcont"><a href=.*? target="_blank" class="link_rk">(.*?)</a></span>',
        html, None)
    if structure is None:
        structure = _first_match(
            r'<span class="lab">建筑结构</span><span class="rcont"><a href=.*? target="_blank" class="link_rk">(.*?)</a></span>',
            html, '其他')

    return {
        '户型': house_type,
        '建筑面积': _first_match(r"area: '(.*?)'", html),
        '朝向': _first_match(r"pageConfig.forward='(.*?)';", html),
        '楼层': _first_match(r'"vwe.housefloor": "(.*?)"', html),
        '装修': _first_match(r'"vwe.fixstatus": "(.*?)",', html),
        '建筑年代': _first_match(r'"vwe.createtime": "(.*?)"', html),
        '电梯': lift,
        '产权性质': _first_match(
            r'产权性质</span><span class="rcont"><a href=.*? target="_blank" class="link_rk">(.*?)</a>',
            html),
        '住宅类别': _first_match(r'"vwe.purpose": "(.*?)",', html, '其他'),
        '建筑结构': structure,
        '建筑类别': building_type,
        '区域': _extract_area(html),
        '总价': _first_match(r'"vwe.totalprice": (.*?),', html),
        '单价': _first_match(r'"vwe.unitprice": "(.*?)",', html),
    }


def _append_record(record):
    """Append one record to the CSV, writing the header row only on creation."""
    frame = pd.DataFrame([record])
    with _CSV_LOCK:
        write_header = not os.path.exists(_CSV_PATH)
        frame.to_csv(_CSV_PATH, mode='a', header=write_header, index=False)


def _close_detail_windows(driver, list_window):
    """Close every window except ``list_window`` and give it the focus again."""
    for handle in driver.window_handles:
        if handle != list_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(list_window)


def get_house_info(url, max_pages=10):
    """Crawl listing pages starting at ``url`` and append each house to the CSV.

    For every listing link on the current page the detail page is opened,
    its fields are extracted and one row is appended to '房天下.csv'. The
    crawler then follows the '下一页' link, for at most ``max_pages`` pages.

    Args:
        url: URL of the first listing page to crawl.
        max_pages: Maximum number of listing pages to process (default 10).

    Returns:
        None. Results are written to the CSV file; errors are printed.

    Note:
        Assumes that clicking a listing opens its detail page in a new
        window/tab, as the original window-switching logic did.
    """
    driver = webdriver.Edge()
    try:
        # One implicit-wait setting covers every later element lookup.
        driver.implicitly_wait(20)
        driver.get(url)
        list_window = driver.current_window_handle

        for page_index in range(max_pages):
            for link in driver.find_elements(By.XPATH, '//dd/h4/a/span'):
                try:
                    link.click()
                    driver.switch_to.window(driver.window_handles[-1])
                    # Give the detail page a moment to finish rendering.
                    time.sleep(1)
                    _append_record(_parse_house_record(driver.page_source))
                    print('---保存成功---')
                except Exception as exc:
                    # Best effort: one broken listing must not abort the page.
                    print(exc)
                finally:
                    # Always return to the listing page, even after a failure,
                    # so the next lookup does not run against a detail page.
                    _close_detail_windows(driver, list_window)

            # No need to navigate past the last page we intend to process.
            if page_index < max_pages - 1:
                driver.find_element(By.LINK_TEXT, '下一页').click()
    except Exception as exc:
        # Driver-level failure or no further page: report it and stop this crawl.
        print(exc)
    finally:
        # Release the browser process no matter how the crawl ended.
        driver.quit()
- 由于数据过多,我们采用多线程模式爬取
- 启动线程,并等待所有线程结束
def start_thread(urls):
    """Start one ``get_house_info`` thread per URL and return the threads.

    Args:
        urls: Iterable of start URLs; each one is passed to its own thread.

    Returns:
        List of the started ``threading.Thread`` objects, in ``urls`` order.
    """
    thread_list = []
    for start_url in urls:
        worker = threading.Thread(target=get_house_info, args=(start_url,))
        thread_list.append(worker)
        worker.start()
        print(f'{worker.name}启动成功')
    return thread_list
def stop_thread(thread_list):
    """Wait for every thread in ``thread_list`` to finish.

    Prints a message for each thread before blocking on it with ``join()``.

    Args:
        thread_list: Iterable of already started ``threading.Thread`` objects.
    """
    for worker in thread_list:
        print(f'{worker.name}等待结束')
        worker.join()
- 房产信息共 100 页。观察网页 URL 可以发现,其末尾为 `i3` 加页码(例如第 1 页是 `i31`,第 11 页是 `i311`)。因此设置 10 个线程,分别从第 1、11、21、…、91 页开始,每个线程爬取 10 页
if __name__ == '__main__':
    # To debug a single page without threads, call
    # get_house_info('https://cs.esf.fang.com/house/i31/') directly.

    # Ten start URLs: the page number follows "i3", giving pages 1, 11, ..., 91.
    urls = [f'https://cs.esf.fang.com/house/i3{1 + 10 * i}/' for i in range(10)]

    # Launch one crawler thread per start URL, then wait for all of them.
    thread_list = start_thread(urls)
    stop_thread(thread_list)
    print('---全部爬取完成---')