from __future__ import absolute_import,division,print_function import matplotlib as mpl import matplotlib.pyplot as plt from matplotlib.pyplot import GridSpec import seaborn as sns import numpy as np import pandas as pda import os ,sys from tqdm import tqdm import warnings warnings.filterwarnings("ignore") sns.set_context("poster",font_scale=1.3) import missingno as msno import pandas_profiling from sklearn.datasets import make_blobs import time #读入数据 data=pda.read_csv("redcard.csv.gz",compression="gzip") print("==================开始分析数据==================================") def load_subgroup(filename,index_col=[0]): return pda.read_csv(filename,compression="gzip",index_col=index_col,encoding="UTF-8") players=load_subgroup("raw_players.csv.gz") print(players.head()) print(players.shape) msno.matrix(players,figsize=(16,7),width_ratios=(15,1)) msno.bar(players.sample(500),color="r") msno.heatmap(players,figsize=(16,7))#缺失值比例关系 plt.show() print("样本数量:",len(players)) print("rater1缺失数量:",len(players[pda.isnull(players["rater1"])])) print("rater2缺失数量:",len(players[pda.isnull(players["rater2"])])) print("rater1,2都缺失数量:",len(players[pda.isnull(players["rater1"])&pda.isnull(players["rater2"])])) #费缺失值 print("rater1非缺失数量:",len(players[players.rater1.notnull()])) players=players[players.rater1.notnull()] msno.bar(players,color="r") plt.show() fig,ax=plt.subplots(figsize=(12,8)) sns.heatmap(pda.crosstab(players.rater1,players.rater2),cmap="Blues",annot=True,fmt="d",ax=ax) ax.set_title("Correlation between Rater 1 and Rater2\n") fig.tight_layout() plt.show() print("=========================") print(pda.crosstab(players.rater1,players.rater2)) players["skinone"]=players[["rater1","rater2"]].mean(axis=1) print(players.head()) sns.distplot(players["skinone"],kde=True)#直方图 sns.distplot(players["skinone"],kde=False)#直方图 plt.show() # fig,ax=plt.subplots(figsize=(12,10)) players.position.value_counts(dropna=False,ascending=True).plot(kind="barh",ax=ax) ax.set_ylabel("Postion") 
ax.set_xlabel("Counts") fig.tight_layout() plt.show() position_types=players.position.unique() print(position_types) defense=['Center Back', 'Defensive Midfielder','Left Fullback','Right Fullback'] midfield=[ 'Right Midfielder','Center Midfielder','Left Midfielder'] forword=['Attacking Midfielder','Left Winger', 'Right Winger','Center Forward'] keeper=['Goalkeeper'] players.loc[players["position"].isin(defense),"postion_new"]="Defense" players.loc[players["position"].isin(midfield),"postion_new"]="Midfield" players.loc[players["position"].isin(forword),"postion_new"]="Forword" players.loc[players["position"].isin(keeper),"postion_new"]="Keeper" print(players.head()) fig,ax=plt.subplots(figsize=(12,10)) players.postion_new.value_counts(dropna=False,ascending=True).plot(kind="barh",ax=ax) ax.set_ylabel("postion_new") ax.set_xlabel("Counts") fig.tight_layout() plt.show()
本文使用 Python 的数据处理库 Pandas 以及可视化库 Matplotlib、Seaborn、missingno 等,对足球运动员数据进行深入分析,包括数据清洗、缺失值处理、两位评分者评分之间的相关性分析及球员位置分布等。
1788

被折叠的 条评论
为什么被折叠?



