dataframe数据的选取和操作
import pandas as pd
import numpy as np
import cx_Oracle
con =cx_Oracle.connect('scott/123@localhost:1521/orcl',encoding='utf-8')
sql =f''' SELECT e.empno, e.ename, e.sal, e.deptno
FROM emp e
UNION ALL
SELECT deptno, NULL, AVG(e.sal), NULL
FROM emp e
GROUP BY e.deptno '''
df =pd.read_sql(sql,con)
df
print(df.info())
print(df.head(5))
print("=========================")
print(df.isnull().sum())
df.dropna(subset=['ENAME'],inplace=True)
print(df.isnull().sum())
print(df.info())
df.drop_duplicates(inplace=True)
df =df.rename(columns={'EMPNO':"员工编号",'ENAME':"员工姓名"})
data = {'name': ['Joe', 'Mike', 'Jack', 'Rose', 'David', 'Marry', 'Wansi', 'Sidy', 'Jason', 'Even'],
'age': [25, 32, 18, np.nan, 15, 20, 41, np.nan, 37, 32],
'gender': [1, 0, 1, 1, 0, 1, 0, 0, 1, 0],
'isMarried': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}
labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)
df
df[0:2]
df['a':'d']
df[(df['age']>30) | (df['isMarried']=='yes')]
df['name']
df.loc['a',:]
df.loc[['a','b','c'],:]
df.loc['a':'d', :]
df.loc[df['age']>30,:]
df.loc[df['age']>30,['name','age']]
df.loc[(df['name']=='Mike') |(df['name']=='Marry'),['name','age']]
df.iloc[0,:]
df.iloc[:3,:]
df.iloc[[1,3,5],:]
df.iloc[1:3, :]
df.iloc[1, [0,2]]
df.iloc[:3, :3]
df.ix[2,'name']
df.ix[['a','c'], [0,1,3]]
df.at['b','name']
df.iat[1,0]
df.where(df['SAL']<2000)
df.where(df['SAL']<2000,100)
df_num = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
df_num.where(df_num%2==0,-df_num)
df_num.where(df_num%2==0,-df_num)==df_num.mask(~(df_num%2==0),-df_num)
df=pd.DataFrame([[1,2,3,4],[11,22,33,44],[111,222,333,444],[1111,2222,3333,4444]],columns=['col1','col2','col3','col4'],)
print(df)
df['col1'].mean()
df.agg(['max','sum','mean'])
df_columns=df.agg({'col1':['sum','min'],'col2':['max','min'],'col3':['sum','min']})
df['col1']*5
df['col1'] * df['col2']
df.iloc[0,:]*5
df.groupby('DEPTNO').sum()
df['SAL'].groupby(df['DEPTNO']).sum()
df.groupby(['DEPTNO','ENAME']).sum()
df['SAL'].groupby([df['DEPTNO'],df['ENAME']]).sum()
mapping={‘a’:‘red’,‘b’:‘red’,‘c’:‘blue’,‘d’:‘blue’,‘e’:‘red’,‘f’:‘orange’}
map_series=pd.Series(mapping)
people.groupby(map_series,axis=1).count()
states=np.array([‘Ohio’, ‘California’, ‘California’, ‘Ohio’, ‘Ohio’])
years=np.array([2004,2005,2006,2005,2006])
df[‘data1’].groupby( [states,years] ).mean()
mapping={‘a’:‘red’,‘b’:‘red’,‘c’:‘blue’,‘d’:‘blue’,‘e’:‘red’,‘f’:‘orange’}
by_column=people.groupby(mapping,axis=1).sum()
people.groupby(len).sum()
key_list=[‘one’,‘one’,‘one’,‘two’,‘two’]
people.groupby([ len,key_list ]).min()
hier_df.groupby(level=‘cty’,axis=1).count()
df['昨天收盘价'] = df['收盘价'].shift(-1)
df['成交量_cum'] = df['成交量'].cumsum()
print((df['涨跌幅'] + 1.0).cumprod())
df.reset_index(inplace=True)
print(df.sort_values(by=['交易日期'], ascending=1))
print(df.sort_values(by=['股票名称', '交易日期'], ascending=[1, 1]))
print(df.empty)
print(pd.DataFrame().empty)
print(df.T)
print(df['收盘价'].rolling(3).max())
df['收盘价_至今均值'] = df['收盘价'].expanding().mean()