# 1. Initialization

# --- Series ---
s = pd.Series(list(range(4)))  # from a list
s = pd.Series(np.arange(1, 10))  # from a NumPy array
s = pd.Series({"小红": 12, "小马": 11, "小王": 13})  # from a dict: keys become the index
s = pd.Series([1, 2, 3, np.nan], index=['a', 'b', 'c', 'd'])  # with explicit labels
s.index = list('cdef')  # relabel an existing Series in place

# Series attributes
s.index
s.values

# --- DataFrame ---
df = pd.DataFrame(np.arange(12).reshape(3, 4))
df = pd.DataFrame(
    np.random.randn(3, 4),
    index=['1', '2', '3'],
    columns=['a', 'b', 'c', 'd'],
)
dates = pd.date_range('20200101', periods=4)  # four consecutive dates
# From a dict of columns; the scalar values are broadcast to every row.
df = pd.DataFrame({
    'A': 1.,
    'B': 'oops',
    'C': pd.Timestamp('20130102'),
    'D': dates,
    'E': pd.Series(list(range(4)), index=list(range(4))),
    'F': pd.Categorical(["blue", "yellow", "red", "grey"]),
})

# DataFrame attributes and quick summaries
df.dtypes
df.index
df.columns
df.values
df.T
df.describe()
df.head()
df.tail(3)

# --- Multilevel index ---
data = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['A', 'A', 'B'], ['Z', 'X', 'C']],
)
# Level names are set through the plural `.names` attribute, one per level.
data.index.names = ['row1', 'row2']
data.columns.names = ['col1', 'col2']
# 2. 数据筛选 (data selection)
# 2. Data selection
dates = pd.date_range('20200101', periods=3)
df = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)

# --- index and slice ---
# Column labels are lowercase here, so they must be referenced as 'a', 'b', ...
df.a
df['a'][:5]
df[['a', 'b']][3:]

# --- select by label: loc ---
df.loc['20200101']
df.loc[:, 'a':'c']
df.loc['20200101', ['a', 'c']]

# --- select by position: iloc ---
df.iloc[:, 1:]
df.iloc[1:, 2:]
df.iloc[[1, 2], :]

# --- mixed selection ---
# `.ix` was removed from pandas; take the first three rows by position and
# the columns by label instead.
df.loc[df.index[:3], ['a', 'c']]

# --- boolean indexing ---
df.a > 2
df[1:2] > 5
# One element per matching row: assign through a single .loc call rather than
# chained indexing, which may write to a temporary copy.
df.loc[df.a > 2, 'a'] = 0
# Whole rows: zero every column of the matching rows.  With this data no value
# in `a` exceeds 2 after the previous line, so nothing changes here.
df[df.a > 2] = 0

# --- add one column ---
df['e'] = np.nan
df['f'] = [1, 2, 3]
df['g'] = np.arange(3)
# A Series is aligned on its index when assigned as a column.
df['h'] = pd.Series([1, 2, 3], index=pd.date_range('20200101', periods=3))

# --- delete one column ---
df = df.drop('e', axis=1)

# --- missing data ---  how={'any', 'all'}
# judge the whole frame: True if any cell is null
df.isnull().values.any()
# drop
df.dropna(axis=0, how='any')  # drop rows where any value is NaN
df.dropna(axis=1, how='all')  # drop columns where all values are NaN
# fill
df.fillna(value=0)
# 3. 数据排序与导入导出 (sorting and import/export)
# 3. Sort

# by index: order the column labels (axis=1) ascending
df.sort_index(axis=1, ascending=True)

# by value: order the rows by the values of one column.
# NOTE(review): assumes `df` is still the frame built in section 2, whose
# columns are lowercase and include no 'E' -- so sort by an existing column.
df.sort_values(by='b')

# 4. Import / export
# data = pd.read_csv('student.csv')
# data.to_pickle('student.pickle')
# 4. 数据合并 (data merging)
# 5. Data merge

# --- concat ---
df1 = pd.DataFrame(np.ones((2, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((2, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((2, 4)) * 3, columns=['a', 'b', 'c', 'd'])
# axis=0 stacks the frames on top of each other; axis=1 puts them side by side.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)

# join={'inner', 'outer'}, default: outer
df1 = pd.DataFrame(np.ones((2, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2])
df2 = pd.DataFrame(np.ones((2, 4)) * 1, columns=['a', 'b', 'c', 'd'], index=[3, 4])
res = pd.concat([df1, df2], axis=1, ignore_index=True)
res = pd.concat([df1, df2], axis=1, join='inner', ignore_index=True)
# `join_axes` was removed from pd.concat; reindexing the concatenated result
# keeps only the rows labelled by df2's index.
res = pd.concat([df1, df2], axis=1).reindex(df2.index)

# --- append rows ---
# DataFrame.append was removed from pandas; pd.concat is the replacement.
df1 = pd.DataFrame(np.ones((2, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((2, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((2, 4)) * 3, columns=['a', 'b', 'c', 'd'])
series = pd.Series([4, 4, 4, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, df2], ignore_index=True)
res = pd.concat([df1, df2, df3], ignore_index=True)
# A Series is turned into a one-row frame so its labels line up with the columns.
res = pd.concat([df1, series.to_frame().T], ignore_index=True)

# --- merge ---  how={'left', 'right', 'inner', 'outer'}
left = pd.DataFrame({
    'key': ['k1', 'k2', 'k3', 'k4'],
    'A': ['a', 'a', 'a', 'a'],
    'B': ['b', 'b', 'b', 'b'],
})
right = pd.DataFrame({
    'key': ['k1', 'k2', 'k3', 'k4'],
    'C': ['c', 'c', 'c', 'c'],
    'D': ['d', 'd', 'd', 'd'],
})
res = pd.merge(left, right, on='key')

# merge on two key columns
left = pd.DataFrame({
    'key1': ['k1', 'k2', 'k3', 'k4'],
    'key2': ['k1', 'k2', 'k3', 'k4'],
    'A': ['a', 'a', 'a', 'a'],
    'B': ['b', 'b', 'b', 'b'],
})
right = pd.DataFrame({
    'key1': ['k0', 'k2', 'k3', 'k4'],
    'key2': ['k1', 'k2', 'k3', 'k4'],
    'C': ['c', 'c', 'c', 'c'],
    'D': ['d', 'd', 'd', 'd'],
})
res = pd.merge(left, right, on=['key1', 'key2'])

# indicator: add a column recording which side each row came from
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# passing a string names the indicator column
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')