Python链家二手房数据分析

最新推荐文章于 2024-11-04 23:18:04 发布

原创 · 最新推荐文章于 2024-11-04 23:18:04 发布 · 2.1k 阅读

2 ·

CC 4.0 BY-SA版权

本文为博主原创文章，未经博主允许不得转载。

【个人成长】专栏收录该内容

7 篇文章

订阅专栏

本文通过爬虫技术从链家网抓取武汉东湖高新区的二手房数据，使用Python进行数据清洗、分析及可视化，展示了房价分布情况，并利用多项式岭回归拟合了房价分布曲线。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from pylab import mpl
import pandas as pd
import numpy as np
import redis
import requests
import re
import time

# Accumulates one dict per scraped listing across all pages.
total = []

# Listing URL prefix; the page number and a trailing '/' are appended per request.
url = 'https://wh.lianjia.com/ershoufang/donghugaoxin/pg'

# Redis connection settings and the client built from them.
HOST = 'localhost'
PORT = 6379
DB = 0
rds = redis.Redis(host=HOST, port=PORT, db=DB)

# HTTP headers sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Accept': 'image/webp,image/*,*/*;q=0.8',
    'Referer': 'https://wh.lianjia.com/ershoufang/donghugaoxin/pg/',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}



def set_ch():
    """Configure matplotlib so that Chinese text can be rendered by pyplot."""
    settings = mpl.rcParams
    # Keep the minus sign '-' from showing up as a box in saved figures.
    settings['axes.unicode_minus'] = False
    # Default font family used for text.
    settings['font.sans-serif'] = ['FangSong']


def get_one_page(i):
    """Scrape one listing page and record every house found on it.

    Fetches page ``i``, extracts the fields of each ``<li>`` entry, stores the
    entry in Redis under its title, appends it to the module-level ``total``
    list and prints it. Entries that lack an expected element or number are
    skipped with a message instead of aborting the crawl.

    Args:
        i: Page number appended to the module-level ``url`` prefix.
    """
    import json  # local import: only needed to serialise entries for Redis

    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    digits = re.compile(r'(\d+)')

    # A timeout keeps a single stalled connection from hanging the whole crawl.
    html = requests.get(url=url + str(i) + '/', headers=headers, timeout=10).content
    soup = BeautifulSoup(html, 'lxml')
    items = soup.select('body > div.content > div.leftContent > ul > li ')
    for item in items:
        try:
            unit_price = item.select('div.info.clear > div.priceInfo > div.unitPrice > span')[0].text
            # The three follow-info fields come from the same text, so parse it once.
            follow_numbers = digits.findall(item.select('div.info.clear > div.followInfo')[0].text)
            house = {
                'title': item.select('div.info.clear > div.title > a')[0].text,
                'area': item.select('div.info.clear > div.address > div')[0].text,
                'totalPrice': item.select('div.info.clear > div.priceInfo > div.totalPrice > span')[0].text,
                'price': digits.findall(unit_price)[0],
                'follower': follow_numbers[0],
                'watched': follow_numbers[1],
                'UploadTime': follow_numbers[-1],
            }
        except IndexError:
            # An expected element or number is missing from this entry.
            print('skipped an entry that could not be parsed on page', i)
            continue
        # redis-py 3.0+ rejects dict values, so store the entry as a JSON string.
        rds.set(house['title'], json.dumps(house, ensure_ascii=False))
        total.append(house)
        print(house)

def data_plot(data, title='', n_bins=400):
    """Plot a normalised histogram of ``data`` with a fitted polynomial curve.

    The histogram bar heights are plotted as points at each bin's left edge,
    a degree-10 polynomial ridge regression is fitted to those points, and
    its prediction over 10000-90000 (step 5) is drawn as a dashed red line.
    The predicted values are also printed.

    Args:
        data: Values to histogram (unit prices when called from this file).
        title: Figure title.
        n_bins: Number of histogram bins; defaults to 400.
    """
    plt.title(title)
    plt.ylabel(u"所占比例")
    plt.xlabel(u'每平单价')
    plt.grid()
    # `normed` was removed from hist() in Matplotlib 3.1; `density` replaces it.
    # n: bar heights (y values); bins: the n_bins + 1 bin edges (x values).
    n, bins, _ = plt.hist(data, bins=n_bins, color='darkgreen', density=True)
    # Drop the last edge so there is exactly one x value per bar.
    left_edges = bins[:-1]
    plt.plot(left_edges, n, '.')

    xData = np.array(left_edges).reshape(-1, 1)
    yData = np.array(n).reshape(-1, 1)
    model = make_pipeline(PolynomialFeatures(10), Ridge())
    model.fit(xData, yData)

    xPred = np.arange(10000, 90000, 5).reshape(-1, 1)
    yPred = model.predict(xPred)
    print(yPred)
    plt.plot(xPred, yPred, '--', color='red')
    plt.show()

def load_draw():
    """Load the saved data from 'guangzhou.npy' and plot it via draw_data."""
    data = np.load('guangzhou.npy')
    draw_data(data)

def draw_data(data):
    """Plot ``data`` under a title stamped with the current local date."""
    # strftime formats the current local time when no time tuple is given.
    today = time.strftime('%Y/%m/%d')
    heading = u"东湖高新区二手房价格分布." + today
    data_plot(data, heading)


def main():
    """Crawl listing pages 1-59 and collect the unit prices in ascending order."""
    for i in range(1, 60):
        get_one_page(i)
        time.sleep(0.5)     # pause between consecutive page requests

    # Unit prices of every scraped listing, sorted ascending.
    prices = sorted(int(item['price']) for item in total)
    # Saving and plotting are currently disabled; to enable them:
    #   np.save('guangzhou.npy', prices)   # cache so the crawl need not be repeated
    #   draw_data(prices)

if __name__ == "__main__":
    # Set up the font configuration before anything is plotted.
    set_ch()
    # load_draw()   # alternative entry point: plot saved data instead of crawling
    main()