开场段子:
下面我们用平行坐标系图来直观呈现全国各省市的高考难度(共4个维度,分别是一本录取率、211录取率、985录取率、清北录取率)。
平行坐标系图是一种适合呈现高维数据(3维以上)的图表形式。顾名思义,它使用一组相互平行的坐标轴来表达数据:每个维度对应一条坐标轴,每个样本点表达为依次穿过各坐标轴的一条折线。
先上数据:
(数据来源:统计局,教育部,各省教育厅,国金证券研究所。西藏一本录取率为估计数值。年份为2018年)
再上图片:
然后上视频:
最后上代码:
import numpy as np
import pandas as pd
import shapely
from shapely import geometry as geo
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import imageio
import os
from PIL import Image
# Global matplotlib configuration, applied in one call. Keys are set in the
# same order as before; each assignment still goes through rcParams validation.
plt.rcParams.update({
    # Use the SimHei sans-serif font so the Chinese labels can be rendered.
    'font.family': 'sans-serif',
    'font.sans-serif': ['SimHei'],
    # Draw minus signs with the ASCII hyphen rather than the Unicode minus.
    'axes.unicode_minus': False,
    # Save/render animations through matplotlib's HTML writer.
    'animation.writer': 'html',
    'animation.embed_limit': 100,
})
def rgba_to_rgb(img_rgba):
    """Flatten an RGBA PIL image onto an opaque white canvas.

    Args:
        img_rgba: PIL image whose fourth band is the alpha channel.

    Returns:
        A new "RGB" image of the same size, with ``img_rgba`` pasted over a
        white background using its alpha band as the paste mask.
    """
    white = (255, 255, 255)
    alpha_band = img_rgba.split()[3]
    canvas = Image.new("RGB", img_rgba.size, white)
    canvas.paste(img_rgba, mask=alpha_band)
    return canvas
def html_to_gif(html_file, gif_file, duration=0.5):
    """Assemble the frame images saved next to an HTML animation into a GIF.

    The frames are read from the directory ``<html_file without extension>_frames``
    (the layout produced when the animation is saved with the HTML writer),
    in sorted file-name order.

    Args:
        html_file: Path of the saved HTML animation. Only used to locate the
            sibling ``*_frames`` directory; the file itself is not read.
        gif_file: Path of the GIF to write.
        duration: Per-frame duration, forwarded unchanged to
            ``imageio.mimsave``.

    Returns:
        ``gif_file``, for convenient chaining.

    Raises:
        FileNotFoundError: If the frames directory does not exist.
        ValueError: If the frames directory contains no files.
    """
    # Strip only the final extension. A plain str.replace(".html", ...) would
    # also rewrite ".html" occurring elsewhere in the path and would leave a
    # path with a different extension untouched.
    frames_dir = os.path.splitext(html_file)[0] + "_frames"
    frame_paths = [os.path.join(frames_dir, name)
                   for name in sorted(os.listdir(frames_dir))]
    if not frame_paths:
        raise ValueError("no frame images found in %r" % frames_dir)

    frames = [imageio.imread(path) for path in frame_paths]
    # Frames with an alpha channel are flattened onto white before encoding.
    if frames[0].shape[-1] == 4:
        frames = [np.array(rgba_to_rgb(Image.fromarray(frame)))
                  for frame in frames]
    imageio.mimsave(gif_file, frames, 'gif', duration=duration)
    return gif_file
# Twelve base line colours, repeated 100 times so that ``cmap[i]`` can be
# indexed directly by row number (up to 1199) without wrapping.
cmap = 100 * [
    '#2E91E5', '#1CA71C', '#DA16FF', '#B68100',
    '#EB663B', '#00A08B', '#FC0080', '#6C7C32',
    '#862A16', '#620042', '#DA60CA', '#0D2A63',
]
# Load the admission-rate table from the Excel workbook under ./data.
# NOTE(review): parallel_coordinates_dance reads the last row of this table as
# the reference line and, with its default arguments, expects the columns
# 省份, 高考模式, 一本录取率, 211录取率, 985录取率 and 清北录取率 — confirm
# against the workbook.
df = pd.read_excel("./data/各省市高考录取率数据.xlsx")
def parallel_coordinates_dance(df, title="中国各省市高考难度",
                               filename=None,
                               figsize=(8, 5),
                               dpi=144,
                               duration=0.5,
                               anotate_points=["江西", "上海", "河南", "山东", "西藏", "北京", "天津"],
                               line_cols=["一本录取率", "211录取率", "985录取率", "清北录取率"],
                               name_col="省份",
                               text_col="高考模式"):
    """Animate a parallel-coordinates chart, drawing one row of ``df`` at a time.

    Every column in ``line_cols`` becomes one vertical axis, scaled so that the
    column maximum sits at 1.0. Rows are revealed in order; each row takes
    ``len(line_cols)`` frames, extending its polyline by one axis per frame.
    The LAST row of ``df`` is never animated: it is drawn in every frame as a
    dashed reference line (the original comments call it the average line).

    Args:
        df: Table with the columns named by ``line_cols``, ``name_col`` and
            ``text_col``.
        title: Figure title.
        filename: When given, the animation is saved to this path with the
            configured matplotlib animation writer. When ``None``, an
            ``IPython.display.HTML`` object is returned instead.
        figsize: Figure size passed to ``plt.subplots``.
        dpi: Figure resolution passed to ``plt.subplots``.
        duration: Seconds per frame (converted to the millisecond interval).
        anotate_points: Row names whose finished lines stay visible (faded and
            labelled) after their animation is over. Rows not listed vanish
            once the next row starts. The default list is never mutated.
        line_cols: Numeric columns, one per axis, in left-to-right order. The
            default list is never mutated.
        name_col: Column holding the row names (used for labels).
        text_col: Column whose value is shown after the name in the large
            centred caption.

    Returns:
        ``filename`` when it was given; otherwise an ``IPython.display.HTML``
        object, or ``None`` if IPython is not installed.
    """
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)

    n_axes = len(line_cols)
    axis_xs = list(range(n_axes))

    # Frame-invariant data: computed once instead of on every frame.
    raw = df[line_cols]
    scaled = pd.DataFrame(raw.values / raw.values.max(axis=0), columns=line_cols)
    scaled.index = df[name_col]
    column_max = raw.max().to_list()

    def point_along(xs, ys, fraction):
        """Return (x, y) located at ``fraction`` of the polyline's length."""
        pt = geo.LineString(list(zip(xs, ys))).interpolate(fraction, normalized=True)
        # Read the coordinates via attributes: converting a Point with
        # np.array() and unpacking it no longer works on Shapely 2.x.
        return pt.x, pt.y

    def label(text, xy, offset, size, color, z):
        """Place bold text at data point ``xy``, shifted by ``offset`` points."""
        ax.annotate(text, xy=xy, xycoords="data",
                    xytext=offset, fontsize=size, fontweight="bold",
                    color=color, textcoords="offset points", zorder=z)

    def draw_reference(ys, value_texts, caption, caption_offset):
        """Draw a dashed gray reference line with per-axis values and a caption."""
        ax.add_artist(plt.Line2D(axis_xs, ys, color="gray", linestyle="--",
                                 linewidth=3.5, alpha=0.7))
        for px, py, text in zip(axis_xs, ys, value_texts):
            label(text, (px, py), (-17, 5), 10, "black", 3)
        label(caption, point_along(axis_xs, ys, 0.5), caption_offset, 12, "black", 4)

    def plot_frame(i):
        """Redraw the whole chart for frame ``i``."""
        ax.clear()
        ax.axis("off")
        row = i // n_axes   # index of the row currently being animated
        m = i % n_axes      # index of the axis its line has reached
        ax.set_xlim(-(n_axes - 1) / 20, 21 * (n_axes - 1) / 20)
        ax.set_ylim(-0.1, 1.1)

        # Vertical axes with their column names underneath.
        for j, col in enumerate(line_cols):
            ax.add_artist(plt.Line2D([j, j], [0, 1.05], color="gray",
                                     linestyle="-", linewidth=2.5, alpha=0.5))
            label(col, (j, 0), (-25, -10), 10, "black", 1)

        # Reference line taken from the last row of the table.
        draw_reference(scaled.iloc[-1, :].values.tolist(),
                       [str(raw.iloc[-1, j]) + "%" for j in range(n_axes)],
                       scaled.index[-1], (-5, -5))

        # Reference line at the per-column maximum (always 1.0 after scaling).
        draw_reference([1.0] * n_axes,
                       [str(value) + "%" for value in column_max],
                       "最高参考线", (-25, -5))

        # Rows that were already animated: only the highlighted ones are kept.
        for j in range(row):
            name = scaled.index[j]
            if name not in anotate_points:
                continue
            ys = scaled.iloc[j, :].values.tolist()
            color = cmap[j % len(cmap)]
            ax.add_artist(plt.Line2D(axis_xs, ys, color=color, linestyle="-",
                                     linewidth=2.5, alpha=0.3))
            # A private generator seeded with the row index gives the same
            # per-row label position on every frame, without reseeding the
            # global NumPy RNG as np.random.seed() would.
            fraction = np.random.RandomState(j).rand()
            label(name, point_along(axis_xs, ys, fraction), (-25, -5), 12, color, 4)

        # Title and the large translucent caption for the current row.
        caption = df[name_col].iloc[row] + ":" + df[text_col].iloc[row]
        color = cmap[row % len(cmap)]
        ax.set_title(title, color="black", fontsize=12)
        ax.text(0.50, 0.50, caption, va="center", ha="center", color=color,
                size=50, alpha=0.5, transform=ax.transAxes, zorder=6)

        # Current row: small dots on the axes already passed, a larger dot on
        # the axis just reached, and the line drawn up to that axis.
        ys = scaled.iloc[row, :].values.tolist()
        ax.scatter(axis_xs[0:m], ys[0:m], s=50, c=color)
        ax.scatter(axis_xs[m:m + 1], ys[m:m + 1], s=100, c=color)
        ax.add_artist(plt.Line2D(axis_xs[0:m + 1], ys[0:m + 1], color=color,
                                 linestyle="-", linewidth=3.5, alpha=0.7))
        for r, (px, py) in enumerate(zip(axis_xs[0:m + 1], ys[0:m + 1])):
            # The newest value is emphasised with a larger font.
            size = 20 if r == m else 10
            label(str(raw.iloc[row, r]) + "%", (px, py), (-17, 8), size, color, 5)

    # The last row is the reference row, so it gets no frames of its own.
    my_animation = animation.FuncAnimation(
        fig, plot_frame,
        frames=range(0, n_axes * (len(df) - 1)),
        interval=int(duration * 1000))

    if filename is not None:
        my_animation.save(filename)
        return filename

    try:
        from IPython.display import HTML
    except ImportError:
        # Best effort, as before: without IPython there is nothing to return.
        return None
    # Render once; to_jshtml() draws every frame and is expensive.
    return HTML(my_animation.to_jshtml())
# Output locations: the GIF is written next to the HTML animation.
html_file = "中国各省市高考难度.html"
gif_file = html_file.replace(".html", ".gif")

# Render the animation to HTML, then stitch its saved frames into a GIF.
parallel_coordinates_dance(df, filename=html_file)
html_to_gif(html_file, gif_file, duration=0.8)
主要思路是使用 plt.Line2D 对象逐帧绘制各条坐标轴、参考线和各省份的折线,再用 FuncAnimation 把这些帧串成动画,从而构建出动态的平行坐标系图。
收工。
如需获取本文高考数据集和源码的同学,可以在公众号后台回复关键词:高考。添加作者微信获取。