链家深圳二手房房价数据分析

最新推荐文章于 2021-08-05 07:58:00 发布

走马走马

最新推荐文章于 2021-08-05 07:58:00 发布

阅读量2k

点赞数 5

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.youkuaiyun.com/weixin_48232848/article/details/118615199

版权

python 专栏收录该内容

10 篇文章

订阅专栏

本文通过爬取链家数据，展示了深圳各区二手房房价的雷达图、饼状图、多维散点图和折线图，揭示房价走势和区域特点。数据分析师使用Python爬虫技术抓取链家数据，并用Pyecharts绘制了房价平均值、最高值、数量及单价分布，帮助读者理解市场动态。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

文章目录

链家深圳二手房房价数据分析

链家深圳二手房房价数据分析

1. 链家数据爬取源码

import csv
import pandas as pd
import requests
from lxml import etree

# HTTP headers sent with every listing request: a desktop Chrome User-Agent
# plus the Host and Referer of the Shenzhen Lianjia second-hand section.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    ),
    'Host': 'sz.lianjia.com',
    'Referer': 'https://sz.lianjia.com/ershoufang/',
}


def getUrl(max_page=100):
    """Crawl the listing pages of every Shenzhen district on Lianjia.

    For each district, pages ``1..max_page`` are requested through
    ``getResponse``, which parses the page and appends its rows to the CSV.
    A progress line is printed per page, and a final line prints the number
    of pages requested per district, in the order of the list below.

    Args:
        max_page: Last page number to request per district (default 100,
            the value the crawl has always used).

    Note:
        The counters track pages *requested*; ``getResponse`` returns nothing,
        so a page that yielded no data is still counted.
    """
    # URL slug and display name are kept as pairs so they cannot drift apart.
    districts = [
        ('yantianqu', '盐田区'),
        ('luohuqu', '罗湖区'),
        ('futianqu', '福田区'),
        ('nanshanqu', '南山区'),
        ('baoanqu', '宝安区'),
        ('longgangqu', '龙岗区'),
        ('longhuaqu', '龙华区'),
        ('guangmingqu', '光明区'),
        ('pingshanqu', '坪山区'),
        ('dapengxinqu', '大鹏新区'),
    ]
    # One counter per district name. The previous version shared a single
    # variable between 罗湖区 and 龙华区, which merged their counts.
    page_counts = {name: 0 for _, name in districts}

    for slug, name in districts:
        for page in range(1, max_page + 1):
            url = 'https://sz.lianjia.com/ershoufang/%s/pg%s/' % (slug, page)
            getResponse(url, name)
            print('%s 第%d页数据获取完成！' % (name, page))
            page_counts[name] += 1

    print(', '.join(str(page_counts[name]) for _, name in districts))


# Fetch one listing page and persist its rows
def getResponse(url, area_name):
    """Download one listing page, extract its listings and append them to the CSV.

    Args:
        url: Address of the listing page to fetch.
        area_name: District name stored in each row's ``houseArea`` field.

    Returns:
        None. Nothing is written when the request fails, the status code is
        not 200, or the response cannot be parsed into a document.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('%s 请求失败: %s' % (url, exc))
        return
    if res.status_code != 200:
        return

    root = etree.HTML(res.text)
    if root is None:
        return

    # XPaths relative to a single listing <li>. Evaluating them per listing
    # keeps every field tied to its own listing; indexing document-wide result
    # lists misaligns rows as soon as one listing lacks a field.
    field_paths = {
        "houseTitle": './div[1]/div[1]/a/text()',
        "posFirst": './div[1]/div[2]/div/a[1]/text()',
        "posSecond": './div[1]/div[2]/div/a[2]/text()',
        "houseInfo": './div[1]/div[3]/div/text()',
        "totalPrice": './div[1]/div[6]/div[1]/span/text()',
        "unitPrice": './div[1]/div[6]/div[2]/span/text()',
        "followInfo": './div[1]/div[4]/text()',
    }

    house_list = []
    for item in root.xpath('//*[@id="content"]/div[1]/ul/li'):
        texts = {}
        for key, path in field_paths.items():
            found = item.xpath(path)
            if not found:
                # Incomplete <li> (does not match the listing layout): skip it.
                break
            texts[key] = found[0]
        else:
            # Title, position, description, total price, unit price, follow info.
            house_list.append({
                "houseArea": area_name,
                "houseTitle": texts["houseTitle"],
                "housePos": texts["posFirst"] + '- ' + texts["posSecond"],
                "houseInfo": texts["houseInfo"],
                "totalPrice": texts["totalPrice"],
                "unitPrice": texts["unitPrice"],
                "followInfo": texts["followInfo"],
            })
    write_to_file(house_list)


# Write listings to the CSV file
def write_to_file(content):
    """Append listing dicts to ``深圳二手房.csv``; no header row is written.

    Args:
        content: Iterable of dicts keyed by the field names listed below.
    """
    fieldnames = ['houseTitle', 'houseArea', 'housePos', 'houseInfo', 'totalPrice', 'unitPrice', 'followInfo']
    # 'a' appends so successive pages accumulate in one file; 'utf_8_sig'
    # keeps the exported CSV from showing up garbled.
    with open('深圳二手房.csv', 'a', encoding='utf_8_sig', newline='') as f:
        # One writer for the whole batch instead of a new one per row.
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writerows(content)


# Read the CSV back
def readfile():
    """Load the scraped CSV and print its column names and the total-price column.

    The file is written without a header row, so the column names are supplied
    here. Otherwise pandas would take the first listing as the header and
    ``df['totalPrice']`` would raise ``KeyError``.
    """
    fieldnames = ['houseTitle', 'houseArea', 'housePos', 'houseInfo', 'totalPrice', 'unitPrice', 'followInfo']
    # 'utf-8-sig' matches the encoding the file is written with.
    df = pd.read_csv("./深圳二手房.csv", encoding="utf-8-sig", header=None, names=fieldnames)
    print(df.columns)
    print(df['totalPrice'])


# Script entry point: run the full crawl when the file is executed directly.
if __name__ == '__main__':
    getUrl()
    # readfile()

2. 雷达图的绘制

2.1 源码

import pandas as pd
import pyecharts.options as opts
from pyecharts.charts import Radar
import csv


def ReadMaxAndAverage():
    """Compute per-district price statistics from ``深圳二手房.csv``.

    Returns:
        A 4-tuple ``(total_max, total_mean, unit_mean, unit_max)``. Each item
        is a list of ``(houseArea, value)`` tuples rounded to one decimal.
        The lists follow pandas' sorted group-key order, not any display
        order, so callers should look values up by district name.
    """
    columns = ['houseTitle', 'houseArea', 'housePos', 'houseInfo', 'totalPrice', 'unitPrice', 'followInfo']
    # The CSV has no header row; without header=None the first listing would
    # be consumed as column names and silently dropped from the statistics.
    df = pd.read_csv('深圳二手房.csv', encoding='utf-8-sig', header=None, names=columns)

    # astype(str) first: if every value is a bare number pandas infers a
    # numeric column, on which the .str accessor would raise.
    df['totalPrice'] = df['totalPrice'].astype(str).str.replace('万', '', regex=False).astype("float")
    df['unitPrice'] = (
        df['unitPrice'].astype(str)
        .str.replace('单价', '', regex=False)
        .str.replace('元/平米', '', regex=False)
        .astype("float")
    )

    def per_district(column, how):
        # Aggregate one column by district and return (district, value) pairs.
        stats = df.groupby('houseArea')[column].agg(how)
        return [(area, round(float(value), 1)) for area, value in stats.items()]

    tempmax = per_district('totalPrice', 'max')        # highest total price
    tempaverage = per_district('totalPrice', 'mean')   # mean total price
    tempunit = per_district('unitPrice', 'mean')       # mean unit price
    tempunitmax = per_district('unitPrice', 'max')     # highest unit price
    return tempmax, tempaverage, tempunit, tempunitmax


def readData():
    """Count the CSV rows that belong to each district.

    Returns:
        A 10-tuple of row counts, in the order of the ``area`` list below.
    """
    area = ['盐田区', '罗湖区', '福田区', '南山区', '宝安区', '龙岗区', '龙华区', '光明区', '坪山区', '大鹏新区']
    counts = dict.fromkeys(area, 0)
    with open("深圳二手房.csv", "r", encoding="utf-8-sig", newline="") as file:
        for item in csv.reader(file):
            # Column 1 holds the district. Skip blank or short rows, which
            # would otherwise raise IndexError, and rows for other districts.
            if len(item) > 1 and item[1] in counts:
                counts[item[1]] += 1
    return tuple(counts[name] for name in area)


def Radar_Base() -> Radar:
    """Build a radar chart comparing the ten Shenzhen districts.

    Each district becomes one series with five values, in the order of the
    schema below: mean total price, highest total price, number of listings,
    mean unit price, highest unit price.

    Returns:
        The configured ``Radar`` chart (not yet rendered).
    """
    areas = ['盐田区', '罗湖区', '福田区', '南山区', '宝安区', '龙岗区', '龙华区', '光明区', '坪山区', '大鹏新区']
    colors = ["pink", "blue", "Cyan", "Aquamarine", "Green", "red", "yellow", "orange", "Tomato", "Salmon"]

    # readData() returns its counts in the same order as `areas`.
    counts = dict(zip(areas, readData()))
    # The statistics come back as (district, value) pairs ordered by sorted
    # district name, so they are looked up by name. Indexing them 0..9 in
    # display order attached other districts' prices to each series.
    total_max, total_avg, unit_avg, unit_max = (dict(pairs) for pairs in ReadMaxAndAverage())

    c = Radar()
    # Radar axes (indicators) and their upper bounds.
    c.add_schema(
        schema=[
            opts.RadarIndicatorItem(name="平均房价", max_=2000),
            opts.RadarIndicatorItem(name="最高房价", max_=12000),
            opts.RadarIndicatorItem(name="区二手房数量", max_=6000),
            opts.RadarIndicatorItem(name="平均单价", max_=120000),
            opts.RadarIndicatorItem(name="最高单价", max_=210000),
        ]
    )
    for area, color in zip(areas, colors):
        # A district absent from the data is drawn with zeros instead of failing.
        values = [[
            total_avg.get(area, 0),
            total_max.get(area, 0),
            counts[area],
            unit_avg.get(area, 0),
            unit_max.get(area, 0),
        ]]
        c.add(area, values, color=color, areastyle_opts=opts.AreaStyleOpts(opacity=0.5, color=color))

    # Series-level and global options.
    c.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    c.set_global_opts(title_opts=opts.TitleOpts(title="深圳二手房对比"), legend_opts=opts.LegendOpts(pos_left="80%"))
    return c


# Build the radar chart and write it to an HTML file when this script runs.
Radar_Base().render("深圳二手房雷达图.html")

2.2 雷达图效果图

在这里插入图片描述

3. 饼状图的绘制

3.1 源代码

from pyecharts import options as opts
from pyecharts.charts import Pie


def Pie_Base():
    """Build a pie chart of each district's share of Shenzhen's second-hand listings.

    Returns:
        The configured ``Pie`` chart (not yet rendered).
    """
    districts = ['盐田区', '罗湖区', '福田区', '南山区', '宝安区', '龙岗区', '龙华区', '光明区', '坪山区', '大鹏新区']
    # Hard-coded share (%) per district. The author's note gives the listing
    # counts as 1265 5406 5598 5488 4206 11055 4089 486 958 343 (38894 total).
    shares = [3.3, 14, 14.4, 14, 10.8, 28, 11, 1, 2.5, 1]
    slices = [[name, share] for name, share in zip(districts, shares)]

    pie = Pie()
    pie.add("", slices)
    pie.set_global_opts(
        title_opts=opts.TitleOpts(title="深圳市二手房各区占比"),
        legend_opts=opts.LegendOpts(pos_left="80%"),
    )
    # Label each slice as "<name>:<value>%".
    pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}:{c}%"))
    return pie


# Build the pie chart and write it to an HTML file when this script runs.
Pie_Base().render("饼状图.html")

3.2 饼状图效果图

在这里插入图片描述

4. 多维散点图

4.1 源码

from pyecharts.charts import Scatter
from pyecharts import options as opts
import pandas as pd
from pyecharts.commons.utils import JsCode


def readFile():
    """Return the mean total price per district from ``深圳二手房.csv``.

    Returns:
        A list of ``(houseArea, mean_total_price)`` tuples, rounded to one
        decimal and ordered by pandas' sorted group keys. The list is also
        printed.
    """
    columns = ['houseTitle', 'houseArea', 'housePos', 'houseInfo', 'totalPrice', 'unitPrice', 'followInfo']
    # The CSV has no header row; without header=None the first listing would
    # be consumed as column names and dropped from the averages.
    df = pd.read_csv('深圳二手房.csv', encoding='utf-8-sig', header=None, names=columns)
    # astype(str) keeps .str usable even when pandas infers a numeric column.
    df['totalPrice'] = df['totalPrice'].astype(str).str.replace('万', '', regex=False).astype("float")
    means = df.groupby('houseArea')['totalPrice'].mean()
    temp = [(area, round(float(value), 1)) for area, value in means.items()]
    print(temp)
    return temp


def Scatter_Base():
    """Build a scatter chart of the mean total price per district.

    Returns:
        The configured ``Scatter`` chart (not yet rendered).
    """
    averages = readFile()
    frame = pd.DataFrame(
        {
            "平均房价": [price for _, price in averages],
            "小区": [area for area, _ in averages],
        }
    )
    # Order the points by ascending mean price.
    frame = frame.sort_values("平均房价", ascending=True)

    # JS callback used as the point label; it returns element 2 of the point's
    # value. Presumably pyecharts prepends the x value to each [price, name]
    # pair, making element 2 the district name - confirm against pyecharts.
    label_formatter = JsCode("function(params){return params.value[2];}")

    scatter = Scatter()
    scatter.add_xaxis(frame["平均房价"].values.tolist())
    scatter.add_yaxis(
        "平均房价",
        frame[["平均房价", "小区"]].values.tolist(),
        label_opts=opts.LabelOpts(formatter=label_formatter),
    )
    scatter.set_global_opts(
        title_opts=opts.TitleOpts(title="深圳二手房平均房价多维散点图"),
        # Continuous numeric x axis starting at 300.
        xaxis_opts=opts.AxisOpts(type_="value", min_=300.0),
    )
    return scatter


# Build the scatter chart and write it to an HTML file when this script runs.
Scatter_Base().render("01 - 深圳二手房平均房价多维散点图.html")

4.2 多维散点图效果图

在这里插入图片描述

5. 玫瑰图

5.1 源码

from pyecharts.charts import Pie
from pyecharts import options as opts


def Pie_RoseType():
    """Build a rose-type pie chart from twelve hard-coded year/quarter values.

    The labels cover the four quarters of 2010, 2011 and 2012.

    Returns:
        The configured ``Pie`` chart (not yet rendered).
    """
    quarters = [
        "201{}年/{}季度".format(year, quarter)
        for year in range(3)
        for quarter in range(1, 5)
    ]
    values = [4.88, 5.88, 6.88, 7.88, 5.88, 7.88, 9.88, 8.88, 9.88, 5.88, 4.88, 6.88]

    rose = Pie()
    rose.add(
        "",
        [[label, value] for label, value in zip(quarters, values)],
        # Inner and outer radius of the chart.
        radius=["0%", "75%"],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    rose.set_global_opts(
        title_opts=opts.TitleOpts(title="年季度玫瑰图显示"),
        legend_opts=opts.LegendOpts(pos_left="80%"),
    )
    return rose


# Build the rose chart and write it to an HTML file when this script runs.
Pie_RoseType().render("02 - 玫瑰图.html")

5.2 玫瑰图效果图

在这里插入图片描述

6. 折线图

6.1 源码

from pyecharts import options as opts
from pyecharts.charts import Line
import pandas as pd


def readFile():
    """Return the mean total price per district from ``深圳二手房.csv``.

    Returns:
        A list of ``(houseArea, mean_total_price)`` tuples, rounded to one
        decimal and ordered by pandas' sorted group keys. The list is also
        printed.
    """
    columns = ['houseTitle', 'houseArea', 'housePos', 'houseInfo', 'totalPrice', 'unitPrice', 'followInfo']
    # The CSV is written without a header row; header=None stops pandas from
    # consuming the first listing as column names and losing it.
    df = pd.read_csv('深圳二手房.csv', encoding='utf-8-sig', header=None, names=columns)
    # Convert via str so the '万' strip also works on a numeric-inferred column.
    prices = df['totalPrice'].astype(str).str.replace('万', '', regex=False)
    df['totalPrice'] = prices.astype("float")
    means = df.groupby('houseArea')['totalPrice'].mean()
    temp = [(area, round(float(value), 1)) for area, value in means.items()]
    print(temp)
    return temp


def LineOne():
    """Render a line chart of the mean total price per district to an HTML file."""
    averages = readFile()
    districts = [area for area, _ in averages]
    prices = [price for _, price in averages]

    chart = Line()
    chart.add_xaxis(districts)
    # Single series: the mean price of each district.
    chart.add_yaxis("平均房价", prices)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="深圳二手房平均房价折线图"),
        # Name shown on the y axis.
        yaxis_opts=opts.AxisOpts(name="平均房价（单位/万）"),
        # Rotate the x-axis labels by 45 degrees.
        xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45)),
    )
    chart.render("03 - 深圳二手房平均房价折线图.html")


# Build and render the line chart when this script runs.
LineOne()