爬取赶集网二手房信息（详细代码）

最新推荐文章于 2021-05-12 14:14:56 发布

原创最新推荐文章于 2021-05-12 14:14:56 发布 · 884 阅读

2 ·

CC 4.0 BY-SA版权

文章标签：

#爬虫 #赶集网

python 专栏收录该内容

6 篇文章

订阅专栏

该网站基本没有反爬措施，直接上代码：

import requests
from lxml import etree
import re
import xlwt
import time

# Request headers sent with every page fetch: a desktop Chrome User-Agent
# string and the Host of the site being scraped.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.109 Safari/537.36'
    ),
    'Host': 'nb.ganji.com',
}


# Create the Excel workbook with a single sheet and fill in the header row.
f = xlwt.Workbook(encoding='utf_8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)

# Column headings, in the order the scraped fields are written below.
column_titles = (
    '名称',
    '厅室',
    '面积',
    '朝向',
    '楼层',
    '小区',
    '联系人',
    '价格',
    '单价',
)
for col, heading in enumerate(column_titles):
    sheet01.write(0, col, heading)

# Index of the next data row to write; row 0 is occupied by the headings.
num = 1

# Compiled once for the whole run; the raw string avoids the invalid "\s"
# escape sequence that a plain string literal triggers a warning for.
_whitespace = re.compile(r"\s")


def _first(nodes, default=""):
    """Return the first XPath result, or *default* when nothing matched."""
    return nodes[0] if nodes else default


def _item(nodes, index, default=""):
    """Return ``nodes[index]``, or *default* when the list is too short."""
    return nodes[index] if index < len(nodes) else default


# Walk listing pages 1..70, extract one row per listing and append it to
# the sheet. `num` is the next free row index and is advanced per listing.
for x in range(1, 71):
    url = 'http://nb.ganji.com/ershoufang/pn%d/' % x
    # Headers must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, so a positional dict would be appended to
    # the query string instead of being sent as HTTP headers.
    response = requests.get(url, headers=headers)
    result = response.text

    html = etree.HTML(result, etree.HTMLParser())
    # Select every listing <div> inside the result list container.
    divs = html.xpath("//div[@class='f-list js-tips-list']/div[contains(@class,'ershoufang-list')]")
    # Pull the wanted fields out of each listing.
    for div in divs:
        title = _first(div.xpath(".//dd[contains(@class,'title')]/a/@title"))
        if not title:
            # Without a title the card cannot be identified as a listing
            # (presumably an ad or placeholder) -- skip it instead of
            # crashing the whole run with an IndexError.
            continue

        # Text nodes of the "size" <dd>; the wanted values sit at fixed
        # positions (presumably separated by markup/whitespace nodes).
        # Missing positions fall back to an empty cell.
        info = div.xpath(".//dd[contains(@class,'size')]//text()")
        rooms = _item(info, 1)
        area = _item(info, 4)
        orientation = _item(info, 7)
        floor = _item(info, 10)

        # The first "address" <dd> is written to the community column, the
        # second to the contact column; all whitespace is stripped.
        address = "".join(div.xpath(".//dd[contains(@class,'address')][1]//text()"))
        address = _whitespace.sub("", address)
        host = "".join(div.xpath(".//dd[contains(@class,'address')][2]//text()"))
        host = _whitespace.sub("", host)

        price = "".join(div.xpath(".//dd[contains(@class,'info')]/div[@class='price']/span/text()"))
        unit_price = _first(div.xpath(".//dd[contains(@class,'info')]/div[@class='time']/text()"))

        row = (title, rooms, area, orientation, floor, address, host, price, unit_price)
        for col, value in enumerate(row):
            sheet01.write(num, col, value)
        num = num + 1
    time.sleep(1)  # pause between pages


# Write the workbook to disk in the current working directory.
output_name = "二手房" + '.xls'
f.save(output_name)