利用连接散点图探索全球的饮用水情况

利用连接散点图探索全球的饮用水情况

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.stats import rankdata

数据探索

以下数据如果有需要的同学可关注公众号HsuHeinrich,回复【数据可视化】自动获取~

# 导入数据
data_raw = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-04/water.csv")

# 数据预处理
data = (
    data_raw
    .dropna(subset=["install_year", "country_name"]) # 剔除空数据
    .assign(
        install_decade = (data_raw["install_year"] // 10) * 10, # 将年份转为年代,即每10年的年初
    )
    .query("1970 <= install_decade <= 2020") # 筛选1970~2020年的数据
)

# 计算新安装量
data["nb_install"] = data.groupby(["country_name", "install_decade"])["country_name"].transform("size") 

# 长宽数据转化-保留1970~2020的数据
data_pivoted = pd.pivot_table(
    data, index="country_name", values="nb_install", columns="install_decade"
).dropna().reset_index()

data = pd.melt(data_pivoted, id_vars = "country_name", value_name="nb_install", var_name="decade")

# 处理数据,生成rank(表示每decade新安装量的降序排名)
data["country_name"] = data["country_name"].str.upper()
data["rank"] = data.groupby("decade")["nb_install"].transform(lambda x: rankdata(-x))
data = data.sort_values(["country_name", "decade"])

data.head()

image-20240129173348659

country_name:名称

decade:年代

nb_install:安装量

rank:排名

绘制连接散点图-突出显示指定国家

  1. 绘制第一个国家,理解本图的组成部分
COUNTRIES = data["country_name"].unique()
COUNTRY = COUNTRIES[0]
# 初始化布局
fig, ax = plt.subplots(figsize=(9, 6))

# 反转y轴
ax.invert_yaxis()

# 循环遍历国家
for country in COUNTRIES:
    d = data[data["country_name"] == country] 
    x = d["decade"].values
    y = d["rank"].values
    
    # 突出显示指定国家
    if country == COUNTRY:
        ax.plot(x, y, color="#0b53c1", lw=2.4, zorder=10)
        ax.scatter(x, y, fc="w", ec="#0b53c1", s=60, lw=2.4, zorder=12)      
    # 其余国家不突出显示
    else:
        ax.plot(x, y, color="#BFBFBF", lw=1.5)

output_8_0

  1. 添加适当的注释信息
def add_label(x, y, fontsize, ax):
    '''
    x:decade取值;y:rank取值
    在对应的点附近添加年代和排名信息
    '''
    PAD = 0.4
    ax.annotate(
        f"#{int(y)} in\n{str(int(x))[2:]}s", 
        xy=(x, y - PAD), 
        ha="center", 
        va="bottom",
        fontsize=fontsize,
        fontname="Lato",
        zorder=12
    )
# 初始化布局
fig, ax = plt.subplots(figsize=(9, 6))
ax.invert_yaxis()

for country in COUNTRIES:
    d = data[data["country_name"] == country] 
    x = d["decade"].values
    y = d["rank"].values
    
    if country == COUNTRY:
        ax.plot(x, y, color="#0b53c1", lw=2.4, zorder=10)
        ax.scatter(x, y, fc="w", ec="#0b53c1", s=60, lw=2.4, zorder=12)
        
        # 添加文本信息(首尾点上)
        add_label(x[0], y[0], 16, ax)
        add_label(x[-1], y[-1], 16,ax)
    else:
        ax.plot(x, y, color="#BFBFBF", lw=1.5)

# 删除y刻度
ax.set_yticks([])

# x刻度
ax.set_xticks([1970, 1980, 1990, 2000, 2010, 2020])

# x刻度标签
ax.set_xticklabels(
    ["70s", "80s", "90s", "00s", "10s", "20s"], 
    fontsize=16,
    fontfamily="Inconsolata"
)
# 删除底部刻度线
ax.tick_params(bottom=False)

# 删除边框
ax.set_frame_on(False)

# 添加标题
ax.set_title(COUNTRY, fontfamily="Inconsolata", fontsize=24, fontweight=500);

output_11_0

  1. 在以上的基础上,绘制多图(多个国家的排名)
def plot_country(country, data, annotate, ax):
    '''
    将上述单个国家的绘制过程写入函数,annotate为控制变量(是否添加文本注释)
    '''
    
    for country_inner in COUNTRIES:
        d = data[data["country_name"] == country_inner] 
        x = d["decade"].values
        y = d["rank"].values

        if country_inner == country:
            ax.plot(x, y, color="#0b53c1", lw=2.4, zorder=10)
            ax.scatter(x, y, fc="w", ec="#0b53c1", s=60, lw=2.4, zorder=12)
            if annotate:
                add_label(x[0], y[0], 10, ax)
                add_label(x[-1], y[-1], 10, ax)
        else:
            ax.plot(x, y, color="#BFBFBF", lw=1.5)

    ax.set_yticks([])
    ax.set_xticks([1970, 1980, 1990, 2000, 2010, 2020])
    ax.set_xticklabels(
        ["70s", "80s", "90s", "00s", "10s", "20s"], 
        fontsize=10,
        fontfamily="Inconsolata"
    )
    ax.tick_params(bottom=False)
    ax.set_frame_on(False)

    ax.set_title(country, fontfamily="Inconsolata", fontsize=14, fontweight=500)
    
    return ax
# 初始化布局
fig, axes = plt.subplots(3, 4, sharex=True, sharey=True, figsize=(14, 7.5))

for idx, (ax, country) in enumerate(zip(axes.ravel(), COUNTRIES)):
    # 仅第一个国家添加文本注释
    annotate = idx == 0
    plot_country(country, data, annotate, ax)

# 反转y轴
ax.invert_yaxis()

# 调整布局
fig.subplots_adjust(wspace=0.1, left=0.025, right=0.975, bottom=0.11, top=0.82)

# 标题
fig.text(
    x=0.5, 
    y=0.92,
    s="RANKING SOME COUNTRIES BY THE NUMBER\nOF WATER SOURCES INSTALLATIONS BY DECADE",
    ha="center",
    va="center",
    ma="center",
    fontsize=22,
    fontweight="bold",
    fontname="Inconsolata"
)

# 著作信息-数据来源
fig.text(
    x=0.975,
    y=0.05,
    s="Data from Water Point Data Exchange",
    ha="right",
    ma="right",
    fontsize=8
)

# 著作信息-作者
fig.text(
    x=0.975,
    y=0.03,
    s="@issa_madjid",
    ha="right",
    ma="right",
    fontsize=8,
    fontweight="bold", 
)

# 推特徽标
twitter_symbol = "\uf099"

fig.text(
    x=0.925,
    y=0.03,
    s=twitter_symbol,
    ha="right",
    ma="right",
    fontsize=8,
    fontweight="bold",
    fontfamily="Font Awesome 5 Brands"
)

# 背景色
fig.set_facecolor("#f9fbfc")

output_14_0

参考:Multi panel highlighted lineplots with Matplotlib

共勉~

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值