# The target site has essentially no anti-scraping measures, so the code below fetches it directly.
import requests
from lxml import etree
import re
import xlwt
import time
# HTTP request headers used for every page fetch: a desktop Chrome
# User-Agent string and the Host of the site being crawled.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.109 Safari/537.36'
    ),
    'Host': 'nb.ganji.com',
}
# Create the Excel workbook with a single sheet and fill in the header row.
f = xlwt.Workbook(encoding='utf_8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)

# Column titles, in the same left-to-right order the data rows are written.
_column_titles = (
    '名称',
    '厅室',
    '面积',
    '朝向',
    '楼层',
    '小区',
    '联系人',
    '价格',
    '单价',
)
for _col, _label in enumerate(_column_titles):
    sheet01.write(0, _col, _label)

# Index of the next data row to write; row 0 holds the titles.
num = 1
# Matches any single whitespace character; compiled once and reused for
# every listing. A raw string avoids the invalid "\s" escape warning.
_WHITESPACE = re.compile(r"\s")


def _pick(nodes, index):
    """Return nodes[index], or '' when the list is too short.

    A listing that lacks a field then yields a blank cell instead of an
    IndexError that would abort the crawl before the workbook is saved.
    """
    return nodes[index] if index < len(nodes) else ''


# Fetch listing pages 1 through 70 and write one spreadsheet row per listing.
for x in range(1, 71):
    url = 'http://nb.ganji.com/ershoufang/pn%d/' % x
    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would put the dict in the query string
    # instead of sending it as HTTP headers. The timeout keeps a stalled
    # connection from blocking the crawl indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    result = response.text
    html = etree.HTML(result, etree.HTMLParser())
    # Select every listing <div> inside the results container.
    divs = html.xpath("//div[@class='f-list js-tips-list']/div[contains(@class,'ershoufang-list')]")
    # Walk each listing and pull out the fields to record.
    for div in divs:
        title = _pick(div.xpath(".//dd[contains(@class,'title')]/a/@title"), 0)
        info = div.xpath(".//dd[contains(@class,'size')]//text()")
        # NOTE(review): text nodes 1, 4, 7 and 10 are taken as rooms, area,
        # orientation and floor; presumably the nodes in between are
        # whitespace and separators - verify against the live markup.
        rooms = _pick(info, 1)
        area = _pick(info, 4)
        orientation = _pick(info, 7)
        floor = _pick(info, 10)
        # The first 'address' <dd> is written to the community column and the
        # second to the contact column; all whitespace is stripped from both.
        address = "".join(div.xpath(".//dd[contains(@class,'address')][1]//text()"))
        address = _WHITESPACE.sub("", address)
        host = "".join(div.xpath(".//dd[contains(@class,'address')][2]//text()"))
        host = _WHITESPACE.sub("", host)
        price = "".join(div.xpath(".//dd[contains(@class,'info')]/div[@class='price']/span/text()"))
        unit_price = _pick(div.xpath(".//dd[contains(@class,'info')]/div[@class='time']/text()"), 0)
        # Write the fields in the same column order as the header row.
        row = (title, rooms, area, orientation, floor, address, host, price, unit_price)
        for col, value in enumerate(row):
            sheet01.write(num, col, value)
        num += 1
    time.sleep(1)  # pause between pages

# Save the workbook once every page has been processed.
f.save("二手房"+'.xls')