# NumPy / pandas summary notes
import numpy as np
# --- ndarray basics: construction and core attributes ---
# A triple-nested list produces a 3-D array.
ar = np.array([[
    [1, 2, 3, 4, 5, 6],
    [2, 3, 4, 5, 6, 7],
    [3, 4, 5, 6, 7, 8],
]])
print(ar, ar.ndim)  # the array followed by its number of dimensions
print(ar.shape)     # length along each axis
print(ar.data)      # buffer object backing the array

# Several ways to build arrays.
ar1 = np.array(range(10))                 # from a Python range
ar2 = np.arange(10)                       # NumPy's own range
ar3 = np.array([[1, 2, 3, 4, 5, 6]])      # doubly nested list -> 2-D
ar4 = np.random.rand(10).reshape((2, 5))  # 10 random samples reshaped to 2x5
print(ar1, ar2, ar3, ar3.ndim, ar4)
print(np.arange(10000), np.arange(5, 10))
print(np.linspace(10, 20, num=21))        # 21 evenly spaced values from 10 to 20
print(np.array(range(10, 20, 2)))
import numpy as np
# --- Array generators: arange / zeros / linspace / range ---
ar1 = np.arange(10)
ar2 = np.zeros((2, 5))                  # 2x5 array filled with 0.0
ar3 = np.linspace(10, 20, num=11)       # 11 evenly spaced values from 10 to 20
ar4 = np.array(range(10, 21, 2))        # 10, 12, ..., 20
print(ar1, ar2, ar3, ar4)
print(ar4.reshape((3, 2)))              # six elements viewed as 3 rows x 2 columns
# --- Stacking arrays horizontally and vertically ---
a = np.arange(5)
b = np.arange(5, 10)
c = np.array([[1], [2], [3]])           # 3x1 column of ints
d = np.array([['a'], ['b'], ['c']])     # 3x1 column of strings

# 2-D inputs: hstack joins columns side by side, vstack appends rows.
print(np.hstack((c, d)))
print(np.vstack((c, d)))

# 1-D inputs: hstack concatenates, vstack makes each input a row.
print(np.hstack((a, b)))
print(np.vstack((a, b)))
print("----------------")
# --- Splitting a 4x4 array into four equal pieces ---
ar1 = np.arange(16).reshape((4, 4))
print(ar1)
print(np.hsplit(ar1, 4))    # split column-wise
print(np.vsplit(ar1, 4))    # split row-wise
# --- Random number generation ---
a = np.random.normal(size=(4, 4))           # normal distribution, default parameters
b = np.random.rand(4, 4)                    # uniform samples
c = np.random.randn(4, 4)                   # standard normal samples
d = np.random.randint(10, size=(2, 5))      # integers below 10
print(a, b)
print(c, d)
import numpy as np
import pandas as pd
# --- Series basics: building from an array, index and values ---
a = np.random.rand(5)
print(a)
print(pd.Series(a, index=['a', 'b', 'c', 'd', 'e']))    # explicit string labels
print(list(pd.Series(a).index))                         # default integer index
print(pd.Series(a).values)                              # underlying ndarray
import numpy as np
import pandas as pd
# --- Series: dropping, adding and concatenating elements ---
# drop() returns a new Series; the original `s` is left untouched.
s = pd.Series(np.random.rand(10))
s1 = s.drop(1)              # remove one label
s2 = s.drop([1, 2, 3])      # remove several labels
print(s1, s2)

# Assigning to a label that does not exist yet enlarges the Series.
s = pd.Series(np.arange(5))
print(s)
s[5] = 100
s[6] = 200
print(s)

# Join two Series end to end. Series.append() was removed in pandas 2.0;
# pd.concat() is the supported replacement and keeps the original labels.
a = pd.Series(np.arange(5))
b = pd.Series(np.arange(5, 10))
print(a, b)
print(pd.concat([a, b]))
# --- DataFrame creation ---
# From a dict of equal-length lists: keys become column names.
data = {
    'name': ['jack', 'tom', 'marry'],
    'age': [18, 19, 20],
    'gender': ['m', 'm', 'w'],
}
frame = pd.DataFrame(data)
print(frame)
print(list(frame.columns))
print(list(frame.index))
print(frame.values)

# From dicts of lists / arrays, optionally with an explicit row index.
data1 = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]}
data2 = {
    'one': np.random.rand(3),
    'two': np.arange(3),
}
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2, index=['a', 'b', 'c'])
print(df1)
print(df2)

# From a 2-D array with both row and column labels supplied.
ar = np.random.rand(9).reshape((3, 3))
print(ar)
df1 = pd.DataFrame(
    ar,
    index=['a', 'b', 'c'],
    columns=['one', 'two', 'three'],
)
print(df1)
import numpy as np
import pandas as pd
# --- DataFrame selection and boolean filtering ---
ar = np.random.rand(9).reshape((3, 3))
df1 = pd.DataFrame(
    ar,
    index=['a', 'b', 'c'],
    columns=['one', 'two', 'three'],
)
print(df1)
print(df1.loc[['a', 'c']])              # rows by label
print(df1[:2])                          # first two rows (slice)
print(df1[::2])                         # every second row
print(df1['one'])                       # a single column as a Series
print(df1[df1['one'] > 0.3])            # rows where column 'one' exceeds 0.3
print(df1.loc['a'] > 0.3)               # boolean Series for row 'a'
print(df1['one'].loc[['a', 'c']])       # column first, then rows by label
# --- Sorting a DataFrame by its index ---
ar = np.random.rand(9).reshape(3, 3)
df1 = pd.DataFrame(ar, index=['a', 'b', 'c'], columns=['one', 'two', 'three'])
print(df1)
# The first parameter of sort_index() is `axis`, so passing the row label 'a'
# there is invalid. Sort by the row index in ascending order instead.
# NOTE(review): if ordering by values was intended, use sort_values(by=...).
print(df1.sort_index(ascending=True))
import numpy as np
import pandas as pd
from datetime import datetime
# --- Timestamps and parsing date strings ---
print(datetime.now())
date1 = '20170101'
date2 = datetime(2016, 10, 1, 15, 0)
print(pd.Timestamp(date1))      # from a compact date string
print(pd.Timestamp(date2))      # from a datetime object

# The last entry is not a date, so plain parsing fails.
date3 = ['2017-12-10', '2017-12-10', '2019-12-10', 'aaaa']
# errors='ignore' is deprecated in to_datetime; reproduce its behaviour
# explicitly: if parsing fails, show the input unchanged.
try:
    print(pd.to_datetime(date3))
except ValueError:
    print(pd.Index(date3))
# errors='coerce' turns unparseable entries into NaT instead of raising.
print(pd.to_datetime(date3, errors='coerce'))

date3 = ['2017-12-10', '2017-12-10', '2019-12-10', 'aaaa']
date4 = pd.to_datetime(date3, errors='coerce')
print(date4)
print(pd.DatetimeIndex(date4)[0])
print(pd.to_datetime(date3, errors='coerce'))
# --- Using a DatetimeIndex as the index of a Series / DataFrame ---
d = ['2017-12-10', '2017-12-10', '2019-12-10']
c = pd.DatetimeIndex(d)
print(c)
a = pd.Series(np.random.rand(len(c)), index=c)
b = pd.DataFrame(
    np.random.rand(len(c)),
    index=c,
    columns=['no'],
)
print(a)
print(b)
# --- Generating date and period ranges ---
# Every second month end. The string alias '2M' is deprecated for date_range
# (renamed to 'ME' in newer pandas), so the offset object is passed directly;
# it means the same thing and is accepted by old and new versions alike.
rng1 = pd.date_range('2017/1/1', '2017/6/1', name='hello', freq=pd.offsets.MonthEnd(2))
# Ten consecutive dates starting at 2017-01-01 (default frequency).
rng2 = pd.date_range(start='2017/1/1', periods=10)
# Periods still use the 'M' alias: every second monthly period.
rng3 = pd.period_range('2017/1/1', '2017/6/1', name='hello', freq='2M')
print(rng1)
print(rng3)
print(rng2)
import numpy as np
import pandas as pd
# --- Resampling a daily time series into 5-day bins ---
rng = pd.date_range('20170101', periods=12)
ts = pd.Series(np.arange(12), index=rng)
print(ts)

ts_re = ts.resample('5D')               # resampler object, not yet aggregated
ts_re1 = ts.resample('5D').sum()        # aggregated: sum of each bin
print(ts_re)
print(ts_re1)

# Other aggregations over the same bins.
print(ts.resample('5D').mean())
print(ts.resample('5D').max())
print(ts.resample('5D').min())
print(ts.resample('5D').median())
print(ts.resample('5D').ohlc())         # open / high / low / close per bin
# --- Descriptive statistics on a DataFrame with missing and mixed data ---
# key1 / key2 are numeric with one NaN each; key3 mixes ints and strings,
# so it is stored as an object column.
df = pd.DataFrame({'key1': [4, 3, 5, np.nan, 6],
                   'key2': [1, 2, np.nan, 4, 5],
                   'key3': [1, 2, 3, 'j', 'k']},
                  index=['a', 'b', 'c', 'd', 'e'])
print(df)

# Since pandas 2.0 these reductions no longer drop non-numeric columns
# silently and raise TypeError on key3; numeric_only=True restricts them
# to the numeric columns explicitly.
print(df.mean(numeric_only=True))                           # per column
print(df.mean(axis=1, numeric_only=True))                   # per row, NaN skipped
print(df.mean(axis=1, skipna=False, numeric_only=True))     # per row, NaN propagates
print(df.count())                                           # non-missing count per column
print(df.median(numeric_only=True))
print(df.std(numeric_only=True), '\n', df.var(numeric_only=True))
print(df.skew(numeric_only=True), '\n', df.kurt(numeric_only=True))

# Running totals stored as new columns.
df['key1_s'] = df['key1'].cumsum()
df['key2_s'] = df['key2'].cumsum()
print(df)
# --- Unique values of a Series ---
s = pd.Series(['a', 's', 'd', 'v', 'a', 'd', 'c', 'f', 'g', 'g'])
sq = s.unique()             # distinct values in order of first appearance
print(s)
print(sq)
print(pd.Series(sq))        # wrap the result back into a Series
# --- Vectorised string methods via the .str accessor ---
s = pd.Series(['A', 'b', 'C', 'bhhello', '1,2,3', np.nan, 'hj'])
df = pd.DataFrame({
    'key1': list('abcdef'),
    'key2': ['hee', 'fv', 'w', 'hijv', '1,2,3', np.nan],
})
print(s)
print(df)
print(s.str.count('b'))                         # occurrences of 'b' per element
print(df['key1'].str.upper())
print(df['key2'].str.lower())
print(s.str.split(',', expand=True, n=6))       # split on commas into columns
# --- Splitting strings into separate columns ---
s = pd.Series(['A,bc', 'bc', 'Cc', 'bhhello', '1,2,3', np.nan, 'hj'])
df = pd.DataFrame({
    'key1': list('abcdef'),
    'key2': ['hee', 'fv', 'w', 'hijv', '1,2,3', np.nan],
})
# expand=True returns a DataFrame with one column per piece; n caps the splits.
print(s.str.split(',', expand=True, n=3))
print(df['key2'].str.split(',', expand=True, n=3))
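# Join parameter notes (presumably for pd.merge -- confirm):
#   on=                       column(s) to join on
#   how='inner'               intersection of keys
#   how='outer'               union of keys
#   how='left'                left join
#   how='right'               right join
#   left_on='', right_on=''   join columns named separately for each side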
# --- Concatenating Series side by side ---
s1 = pd.Series([1, 2, 3])
s2 = pd.Series([3, 4, 5])
s3 = pd.Series([1, 2, 3], index=['a', 'c', 'h'])
s4 = pd.Series([2, 3, 4], index=['b', 'e', 'd'])
print(s3)
print(s4)
# axis=1 places each Series in its own column, aligned on the index labels.
print(pd.concat([s3, s4], axis=1))
import numpy as np
import pandas as pd
# --- Pivot tables ---
# Three dates repeated three times -> nine rows.
date = ['2017-5-1', '2017-5-2', '2017-5-3'] * 3
rng = pd.to_datetime(date)
print(rng)
# 'key' has to be a list with one label per row. A bare string is a scalar
# and would be copied unchanged into every row, leaving only one distinct key.
df = pd.DataFrame({'date': rng,
                   'key': list('abcdabcda'),
                   'values': np.random.rand(9) * 10})
print(df)
print('____________')
# Variance of 'values' for each date.
print(pd.pivot_table(df, values='values', index='date', aggfunc='var'))
# Same, broken down by key.
# NOTE(review): every (date, key) pair occurs only once in this data, so each
# cell has a single sample; consider another aggfunc (e.g. 'sum') for a
# more informative table.
print(pd.pivot_table(df, values='values', index='date', aggfunc='var', columns='key'))
# --- Cross tabulation ---
df = pd.DataFrame({
    'A': [1, 2, 2, 2, 2],
    'B': [3, 3, 4, 4, 4],
    'C': [1, 1, np.nan, 1, 1],
})
print(df)
print('____________')
print(pd.crosstab(df['A'], df['B']))                    # frequency of each (A, B) pair
print(pd.crosstab(df['A'], df['B'], normalize=True))    # the same as fractions of the total
print('____________')
# Aggregate column C (sum) within each (A, B) cell instead of counting.
print(pd.crosstab(df['A'], df['B'], values=df['C'], aggfunc='sum'))
# --- Reading data files ---
# The original keyword names were misspelled and are rejected by pandas;
# the correct ones are delimiter / index_col / encoding / sheet_name.
# Comma-delimited text, first line as header, second column as the row index.
pd.read_table('date.txt', delimiter=',', header=0, index_col=1)
# CSV parsed with the Python engine, decoded as UTF-8.
data2 = pd.read_csv('data2.csv', engine='python', encoding='utf8')
# A list for sheet_name returns a dict of DataFrames keyed by sheet position.
data3 = pd.read_excel('data2.xlsx', header=0, sheet_name=[0, 1])