pandas常用方法

最新推荐文章于 2025-04-18 08:30:03 发布

莱尼布鲁斯

最新推荐文章于 2025-04-18 08:30:03 发布

阅读量253

点赞数

分类专栏： python 文章标签： python 数据分析

本文链接：https://blog.youkuaiyun.com/qq_45769877/article/details/113209132

版权

python 专栏收录该内容

3 篇文章

订阅专栏

文章目录

1.初始化和属性

# 1. Initialization
#Series
s = pd.Series(list(range(4)))	#list
s = pd.Series(np.arange(1,10))		#array
s = pd.Series({"小红":12, "小马":11, "小王":13})		#dict
s = pd.Series([1,2,3,np.nan], index=['a','b','c','d'])
s.index = list('cdef')

#attribute
s.index
s.values

#DataFrame
df = pd.DataFrame(np.arange(12).reshape(3,4))
df = pd.DataFrame(np.random.randn(3,4), index = ['1','2','3'], columns = ['a', 'b', 'c', 'd'])
dates = pd.date_range('20200101', periods=4)	#date
df = pd.DataFrame({'A':1.,
				'B':'oops',
				'C':pd.Timestamp('20130102'),
				'D':dates,
				'E':pd.Series(list(range(4)), index = list(range(4))),
				'F':pd.Categorical(["blue","yellow","red","grey"])})
#attribute
df.dtypes
df.index
df.columns
df.values
df.T
df.describe()
df.head()
df.tail(3)


#multilevel index
data = pd.DataFrame(np.arange(12).reshape(4,3), index=[['a','a','b','b'],[1,2,1,2]],columns=[['A','A','B'],['Z','X','C']])
data.index.name=['row1','row2']
data.columns.names['col1','col2']

2.数据筛选

# 2. Data selection
dates = pd.date_range('20200101', periods=3)
df = pd.DataFrame(np.arange(12).reshape((3,4)), index=dates, columns=['a', 'b', 'c', 'd'])

#index and slice
df.A
df['A'][:5]
df[['A','B']][3:]

#select by label:loc
df.loc['20200101']
df.loc[:,'a':'c']
df.loc['20200101',['a','c']]

#select by position:iloc
df.iloc[:,1:]
df.iloc[1:,2:]
df.iloc[[1,2],:]

#mixes selection:ix
df.ix[:3,['a', 'c']]

#boolean indexing
df.a>2
df[1:2]>5
df.a[df.a>2] = 0 #one element
df[df.a>2] = 0  #whole row

#add one column
df['e'] = np.nan
df['f'] = [1,2,3]
df['g'] = np.arange(3)
df['h'] = pd.Series([1,2,3], index=pd.date_range('20200101', periods=3))
#del one column
df = df.drop('e', axis=1)  

#data missing	#how={'any','all'}
#judge whole df
np.any(df.isnull()) == True
#drop
df.dropna(axis=0, how='any')	#any is nan
df.dropna(axis=1, how='all')	#all are nan
#fill
df.fillna(value = 0)

3.数据排序与导入导出

# 3. Sort
#by index
df.sort_index(axis=1, ascending=True)
#by value
df.sort_values(by='E')

# 4. Import export
# data = pd.read_csv('student.scv')
# data.to_pickle('student.pickle')

4.数据合并

# 5. Data merge
#concat
df1 = pd.DataFrame(np.ones( (2, 4) )*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones( (2, 4) )*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones( (2, 4) )*3, columns=['a', 'b', 'c', 'd'])
res = pd.concat([df1,df2,df3], axis=0, ignore_index=True)	#0-column 1-row
#join={'inner', 'outer'}  default:outer
df1 = pd.DataFrame(np.ones( (2, 4) )*0, columns=['a', 'b', 'c', 'd'], index=[1,2])
df2 = pd.DataFrame(np.ones( (2, 4) )*1, columns=['a', 'b', 'c', 'd'], index=[3,4])
res = pd.concat([df1,df2], axis=1, ignore_index=True)
res = pd.concat([df1,df2], axis=1, join='inner', ignore_index=True)
res = pd.concat([df1,df2], axis=1, join_axes=[df2.index])

#append
df1 = pd.DataFrame(np.ones( (2, 4) )*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones( (2, 4) )*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones( (2, 4) )*3, columns=['a', 'b', 'c', 'd'])
series = pd.Series([4,4,4,4], index=['a', 'b', 'c', 'd'])
res = df1.append(df2, ignore_index=True)
res = df1.append([df2,df3], ignore_index=True)
res = df1.append(series, ignore_index=True)

#merge
left = pd.DataFrame({'key':['k1','k2','k3','k4'],
					'A':['a','a','a','a'],
					'B':['b','b','b','b']})
right = pd.DataFrame({'key':['k1','k2','k3','k4'],
					'C':['c','c','c','c'],
					'D':['d','d','d','d']})
res = pd.merge(left, right, on='key')

#how = {'left', 'right', 'inner', 'outer'}
left = pd.DataFrame({'key1':['k1','k2','k3','k4'],
					'key2':['k1','k2','k3','k4'],
					'A':['a','a','a','a'],
					'B':['b','b','b','b']})
right = pd.DataFrame({'key1':['k0','k2','k3','k4'],
					'key2':['k1','k2','k3','k4'],
					'C':['c','c','c','c'],
					'D':['d','d','d','d']})
res = pd.merge(left, right, on=['key1', 'key2'])

#indicator
df1 = pd.DataFrame({'col1':[0, 1], 'col_left':['a', 'b']}) 
df2 = pd.DataFrame({'col1':[1,2,2], 'col_right':[2,2,2]})
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')

5.Matplotlib画图

# 6. With Matplotlib
data = pd.DataFrame(np.random.randn(1000,4), index=np.arange(1000), columns=['a','b','c','d'])
data = data.cumsum()
ax = data.plot.scatter(x='a', y='b', color='Blue', label='Class 1')
data.plot.scatter(x='a', y='c', color='Green', label='Class 2', ax=ax)
data.plot.scatter(x='b', y='c', color='Yellow', label='Class 2', ax=ax)
plt.show()

6.豆瓣电影数据

import numpy as np
import pandas as pd

df = pd.read_excel('movies_data.xlsx' ,encoding = 'utf8')

#添加一行数据
dict = {'名字':'千与千寻','投票人数':'12345','类型':'动漫','产地':'日本','上映时间':'20190101','时长':'100','年代':'2019','评分':'9.6','首映地点':'日本'}
s = pd.Series(dict)
s.name = len(df)
df = df.append(s)
#删除一行数据
df = df.drop([df.shape[0]-1])

#增加一列数据
df['序号'] = range(len(df))
#删除一列数据
df = df.drop('序号',axis=1)
#筛选产地美国或者中国大陆，评分大于9的电影
df[((df['产地']=='美国')|(df['产地']=='中国大陆'))&(df['评分']>9)]


#缺失值处理
#判断是否有缺失值
df.isnull()
df['名字'].isnull()
#填充缺失值
df['评分'].fillna(np.mean(df['评分']), inplace=True)
df['名字'].fillna('未知电影', inplace=True)
#删除缺失行
df.dropna(axis=1, inplace=True)

#数据格式转换
df['投票人数'] = df['投票人数'].astype('int')
df['产地'] = df['产地'].astype('str')
#类型错误
# df['年代'] = df['年代'].astype('int')  error '2008\u200e'
df[df.年代=='2008\u200e'].values
df.loc[15205,'年代'] = 2008

#异常值处理
#判断是否有异常值
df[df.投票人数<0]
df[df.投票人数%1!=0]
#缺失值处理
df = df[df.投票人数>0]
df = df[df.投票人数%1==0]
df.drop(df[df.投票人数<0].index, inplace=True)
df.drop(df[df.投票人数%1!=0].index, inplace=True)

#排序
df.sort_values(by='投票人数', ascending=False)
df.sort_values(by=['评分','投票人数'])

#基本统计方法
df.describe()
df['投票人数'].max()
df['投票人数'].sum()
df['评分'].mean()
df['评分'].std()
df['产地'].unique()
df['产地'].value_counts()

#数据透视
print(pd.pivot_table(df, index=['年代']))
pd.pivot_table(df, index=['年代','产地'])
pd.pivot_table(df, index=['年代'], values=['评分'])
# 数据聚合和总和   聚合函数
pd.pivot_table(df, index=['年代','产地'], values=['投票人数'],aggfunc=np.sum)
pd.pivot_table(df, index=['产地'], values=['投票人数','评分'], aggfunc=[np.sum, np.mean])
# 数据 总和ALL
pd.pivot_table(df, index=['产地'],aggfunc=[np.sum, np.mean], fill_value=0, margins = True)
# aggfunc 字典数据
pd.pivot_table(df, index=['产地'], values=['投票人数','评分'], aggfunc={'投票人数':np.sum,'评分':np.mean})
# 透视表过滤
table = pd.pivot_table(df, index = ['年代'], values=['投票人数','评分'],aggfunc={'投票人数':np.sum,'评分':np.mean}, fill_value=0)
table[table.index == 1994]
table.sort_values('评分',ascending=False)[:10]


#数据离散化
pd.cut(df['评分'], [0,3,5,7,9,10], labels=['E','D','C','B','A'])
df['评分等级']=pd.cut(df['评分'], [0,3,5,7,9,10], labels=['E','D','C','B','A'])
bins = np.percentile(df['投票人数'], [0,20,40,60,80,100])
df['热门人数']=pd.cut(df['投票人数'], bins, labels=['E','D','C','B','A'])