1. 爬取网站新房内容
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
# HTTP headers sent with every page request: an Internet Explorer 10
# user-agent string and a Referer that points at a baidu.com link URL.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (compatible; MSIE 10.0; '
        'Windows NT 6.1; WOW64;Trident/6.0; SLCC2;.NET CLR 2.0.50727; .NET CLR 3.5.30729;'
        '.NET CLR 3.0.30729; InfoPath.3; .NET4.0C; .NET4.0E)'
    ),
    'Accept': 'image/webp,image/*,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Referer': (
        'http://www.baidu.com/link?'
        'url=_andhfsjjjKRgEWkj7i9cFmYYGsisrnm2A-TN3XZDQXxvGsM9k9ZZSnikW2Yds4s&wd=&'
        ';amp;eqid=c3435a7d00006bd600000003582bfd1f'
    ),
    'Connection': 'keep-alive',
}

# Path prefix that precedes the page number in listing URLs.
page = 'pg'
def generate_cityurl(user_in_city, region):
    """Build the Lianjia new-home listing URL for one district of a city.

    Args:
        user_in_city: City subdomain abbreviation, e.g. ``"cd"``. Surrounding
            whitespace and leading/trailing dots are ignored, so ``"cd."``
            (as shown in the interactive prompt) does not produce the
            invalid host ``cd..lianjia.com``.
        region: District path segment. Surrounding whitespace and slashes
            are ignored.

    Returns:
        str: ``https://<city>.lianjia.com/loupan/<region>/#<region>``. The
        trailing ``#<region>`` is a URL fragment, which is kept by the
        client and is not part of the request sent to the server.
    """
    city = user_in_city.strip().strip('.')
    district = region.strip().strip('/')
    return 'https://' + city + '.lianjia.com/loupan/' + district + '/#' + district
def areainfo(url, pages=20):
    """Download consecutive listing pages and return their HTML concatenated.

    Args:
        url: Listing URL for a district. It may end with a ``#fragment``;
            the fragment is dropped before page URLs are built.
        pages: Number of result pages to fetch, starting at page 1.
            Defaults to 20.

    Returns:
        bytes: The raw response bodies of all fetched pages, joined in page
        order (empty when ``pages`` is less than 1).
    """
    # A URL fragment is never sent to the server, so appending "pgN/" after
    # "#region" would request the identical first page on every iteration.
    # The page segment therefore has to go into the path, before any fragment.
    base = url.split('#', 1)[0]
    if not base.endswith('/'):
        base += '/'

    chunks = []
    for number in range(1, pages + 1):
        page_url = base + 'pg' + str(number) + '/'
        print(page_url)
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url=page_url, headers=headers, timeout=10)
        chunks.append(response.content)
        time.sleep(0.5)  # pause between requests to keep the request rate low
    return b''.join(chunks)
# Accumulates one dict per scraped listing; listinfo() appends to it.
hlist = []


def _find_text(parent, tag, css_class, default="暂无"):
    """Return the text of the first matching child of *parent*, or *default*.

    *default* is also returned when *parent* itself is None, so lookups can
    be chained without intermediate None checks.
    """
    if parent is None:
        return default
    node = parent.find(tag, attrs={"class": css_class})
    return default if node is None else node.get_text()


def listinfo(listhtml):
    """Parse listing HTML and append one record per listing card to ``hlist``.

    Every ``div.resblock-desc-wrapper`` element is treated as one listing.
    Each record is a dict with the keys ``title``, ``wuye``,
    ``xiaoshouzhuangtai``, ``location``, ``jishi``, ``area``, ``tag``,
    ``price`` and ``totalprice``. A field whose element is missing from the
    card is filled with the placeholder "暂无" instead of aborting the whole
    parse.

    Args:
        listhtml: HTML markup (str or bytes) containing the listing cards.
    """
    soup = BeautifulSoup(listhtml, 'html.parser')
    for card in soup.find_all('div', attrs={'class': 'resblock-desc-wrapper'}):
        title_block = card.find("div", attrs={"class": "resblock-name"})
        if title_block is None:
            name_link, spans = None, []
        else:
            name_link = title_block.a
            spans = title_block.find_all("span")

        # The first two <span> tags of the title block are stored as the
        # 'wuye' and 'xiaoshouzhuangtai' fields; either may be absent.
        wuye = spans[0].get_text() if len(spans) > 0 else "暂无"
        status = spans[1].get_text() if len(spans) > 1 else "暂无"

        price_block = card.find("div", attrs={"class": "resblock-price"})

        hlist.append({
            'title': "暂无" if name_link is None else name_link.get_text(),
            'wuye': wuye,
            'xiaoshouzhuangtai': status,
            'location': _find_text(card, "div", "resblock-location").replace("\n", ""),
            'jishi': _find_text(card, "a", "resblock-room").replace("\n", ""),
            'area': _find_text(card, "div", "resblock-area"),
            'tag': _find_text(card, "div", "resblock-tag"),
            'price': _find_text(price_block, "div", "main-price"),
            'totalprice': _find_text(price_block, "div", "second"),
        })
if __name__ == '__main__':
    # Ask which city subdomain and district to scrape, then show the URL.
    user_in_city = input('输入抓取城市(简称,如:成都,就输入:cd.):')
    region = input('请输入区域(全拼):')
    url = generate_cityurl(user_in_city, region)
    print(url)

    columns = ['title', 'wuye', 'xiaoshouzhuangtai', 'location',
               'jishi', 'area', 'tag', 'price', 'totalprice']
    chinese_labels = ["楼盘名称", "物业类型", "销售状态", "位置",
                      "房型", "面积", "标签", "单价", "总价"]
    # The Chinese labels are stored as the first record, so the CSV carries
    # them as a second header row directly under the English column names.
    hlist.append(dict(zip(columns, chinese_labels)))

    # Download the pages, then parse them into hlist.
    areahtml = areainfo(url)
    listinfo(areahtml)

    # utf_8_sig writes a byte-order mark at the start of the file.
    houseinfo = pd.DataFrame(hlist, columns=columns)
    houseinfo.to_csv('./链家自定义新房.csv', index=False, encoding="utf_8_sig")
1.1 结果
2. 生成信息对比图
import csv
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

# Romanized tick labels for the property types the chart was originally
# labelled with by hand. Any other type falls back to its raw CSV text,
# which needs a CJK-capable matplotlib font to render.
# NOTE(review): the original label list contained 'zhucha', presumably a
# typo for "zhuzhai" (住宅) - confirm.
TYPE_LABELS = {
    '写字楼': 'xiezilou',
    '商业': 'shangye',
    '底商': 'dishang',
    '住宅': 'zhuzhai',
}

# utf-8-sig matches the encoding the scraper writes the CSV with (it strips
# the byte-order mark); newline='' is what the csv module expects for files.
with open('链家自定义新房.csv', 'r', encoding='utf-8-sig', newline='') as csv_source:
    data = list(csv.reader(csv_source))

# Rows 0 and 1 are both header rows (English column names, then the Chinese
# labels), so the property-type column is read from row 2 onwards. Rows too
# short to have that column are skipped.
blist = [row[1] for row in data[2:] if len(row) > 1]

# Count how many listings there are of each property type.
type_counts = Counter(blist)
print(dict(type_counts))

values = list(type_counts.values())

# Bar chart of listing counts per property type. Tick positions and labels
# are both derived from the counted categories so they always line up.
positions = range(len(values))
plt.bar(positions, values)
plt.xticks(positions, [TYPE_LABELS.get(kind, kind) for kind in type_counts])
plt.title('Comparison of Room Types in Nearby Areas')
plt.show()