import json
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import redis
import requests
from bs4 import BeautifulSoup
from pylab import mpl
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
# Accumulates every listing dict scraped during this run.
total = []

# Listing URL prefix; the page number and a trailing '/' are appended per request.
url = 'https://wh.lianjia.com/ershoufang/donghugaoxin/pg'

# Connection settings for the Redis instance that stores scraped listings.
HOST = 'localhost'
PORT = 6379
DB = 0
rds = redis.Redis(host=HOST, port=PORT, db=DB)

# HTTP headers sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Accept': 'image/webp,image/*,*/*;q=0.8',
    'Referer': 'https://wh.lianjia.com/ershoufang/donghugaoxin/pg/',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
}
def set_ch():
    """Configure matplotlib so Chinese text can be rendered by pyplot."""
    mpl.rcParams.update({
        # Default sans-serif font.
        'font.sans-serif': ['FangSong'],
        # Keep the minus sign '-' from showing up as a box in saved images.
        'axes.unicode_minus': False,
    })
# Compiled once: every numeric field is extracted from free text with this pattern.
_NUMBER_RE = re.compile(r'(\d+)')


def _select_text(item, selector):
    """Return the text of the first node under *item* matching *selector*, or None."""
    nodes = item.select(selector)
    return nodes[0].text if nodes else None


def _parse_house(item):
    """Build the listing dict for one ``<li>`` element.

    Returns None when the element lacks any of the expected nodes or
    numbers, so the caller can skip it instead of crashing the crawl.
    """
    title = _select_text(item, 'div.info.clear > div.title > a')
    area = _select_text(item, 'div.info.clear > div.address > div')
    total_price = _select_text(
        item, 'div.info.clear > div.priceInfo > div.totalPrice > span')
    unit_price_text = _select_text(
        item, 'div.info.clear > div.priceInfo > div.unitPrice > span')
    follow_text = _select_text(item, 'div.info.clear > div.followInfo')
    if None in (title, area, total_price, unit_price_text, follow_text):
        return None

    unit_price_numbers = _NUMBER_RE.findall(unit_price_text)
    follow_numbers = _NUMBER_RE.findall(follow_text)
    # 'price' needs one number; 'follower' and 'watched' need at least two.
    if not unit_price_numbers or len(follow_numbers) < 2:
        return None

    return {
        'title': title,
        'area': area,
        'totalPrice': total_price,
        'price': unit_price_numbers[0],
        'follower': follow_numbers[0],
        'watched': follow_numbers[1],
        'UploadTime': follow_numbers[-1],
    }


def get_one_page(i):
    """Fetch listing page *i*, then store, collect and print each listing found.

    Each parsed listing is written to Redis under its title, appended to the
    module-level ``total`` list and printed.

    Args:
        i: Page number appended to the module-level ``url``.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=url + str(i) + '/', headers=headers, timeout=10)
    soup = BeautifulSoup(response.content, 'lxml')
    for item in soup.select('body > div.content > div.leftContent > ul > li '):
        house = _parse_house(item)
        if house is None:
            # Not a regular listing entry (expected nodes are missing); skip it.
            continue
        # Redis values must be bytes/str/numbers, so the dict is stored as JSON.
        rds.set(house['title'], json.dumps(house, ensure_ascii=False))
        total.append(house)
        print(house)
def data_plot(data, title=''):
    """Plot a normalized histogram of *data* with a fitted polynomial curve.

    Draws a 400-bin density histogram, overlays the bin heights as dots,
    fits a degree-10 polynomial ridge regression to (left bin edge, height),
    prints and plots its prediction over 10000..90000 (step 5) as a red
    dashed line, then shows the figure.

    Args:
        data: Sequence of numeric values to histogram.
        title: Title shown at the top of the figure.
    """
    bin_count = 400

    plt.title(title)
    plt.ylabel(u"所占比例")
    plt.xlabel(u'每平单价')
    plt.grid()

    # `density=True` replaces the `normed=True` keyword, which current
    # Matplotlib no longer accepts.
    # heights: values on the y axis, edges: values on the x axis.
    heights, edges, _ = plt.hist(
        data, bins=bin_count, color='darkgreen', density=True)
    # hist returns bin_count + 1 edges; keep only the left edge of each bin.
    left_edges = edges[:bin_count]
    plt.plot(left_edges, heights, '.')

    xData = np.array(left_edges).reshape(-1, 1)
    yData = np.array(heights).reshape(-1, 1)
    model = make_pipeline(PolynomialFeatures(10), Ridge())
    model.fit(xData, yData)

    xPred = np.arange(10000, 90000, 5).reshape(-1, 1)
    yPred = model.predict(xPred)
    print(yPred)
    plt.plot(xPred, yPred, '--', color='red')
    plt.show()
def load_draw():
    """Load the price array saved in 'guangzhou.npy' and plot it via draw_data."""
    data = np.load('guangzhou.npy')
    draw_data(data)
def draw_data(data):
    """Plot *data* under a title that ends with today's local date (YYYY/MM/DD)."""
    # strftime without an explicit time tuple formats the current local time.
    today = time.strftime('%Y/%m/%d')
    plot_title = u"东湖高新区二手房价格分布." + today
    data_plot(data, plot_title)
def main(first_page=1, last_page=59, delay=0.5):
    """Crawl a range of listing pages and return the sorted unit prices.

    Args:
        first_page: First page number to fetch (inclusive).
        last_page: Last page number to fetch (inclusive).
        delay: Seconds to sleep after each page request.

    Returns:
        Ascending list of integer unit prices for every listing collected in
        the module-level ``total`` list. The result can be passed to
        ``draw_data`` to plot it, or saved with ``np.save`` to avoid
        re-crawling.
    """
    for page in range(first_page, last_page + 1):
        get_one_page(page)
        # Pause between requests so the crawl does not hit the site back-to-back.
        time.sleep(delay)

    return sorted(int(house['price']) for house in total)
if __name__ == '__main__':
    # Configure the plot font first so Chinese labels can be rendered.
    set_ch()
    # Crawl the listing pages; call load_draw() here instead to plot the
    # prices saved in the .npy file without crawling again.
    main()
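# Source article: "Python链家二手房数据分析" (Python analysis of Lianjia second-hand housing data)
# Page footer residue: latest recommended article published 2024-11-04 23:18:04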