1.reset_index
当对DataFrame数据删除缺失值(或按条件筛选行)后,原有的index会变得不连续,可以用reset_index对其进行调整,重新生成从0开始的index:drop=False(默认值)会把原来的index保留为名为"index"的一列;drop=True则直接丢弃原来的index。如果不调用reset_index,则不进行调整,这样可以保持原有的数据索引。
# pd / np are not imported anywhere in the visible notes, so bring them in here.
import numpy as np
import pandas as pd

# 6x4 frame of random values indexed by six consecutive dates.
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
print(df)  # print(...) with one argument works on both Python 2 and 3

# Keep the first four dates and add an empty column E (filled with NaN),
# then set E to 1 on the 1st and 3rd rows only.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1.loc[dates[0], 'E'] = 1
df1.loc[dates[2], 'E'] = 1
print(df1)

# Filter to the rows where E == 1 and renumber them from 0.
# drop=False keeps the old date index as a regular column named "index".
df1 = df1[df1.E == 1].reset_index(drop=False)
print(df1)
A B C D
2013-01-01 -0.181389 -0.088044 -0.459681 -0.201029
2013-01-02 1.675779 -0.554188 0.973400 0.891679
2013-01-03 -1.030581 -0.100559 -0.342108 0.189982
2013-01-04 1.398232 -0.986472 -1.517883 0.358021
2013-01-05 0.321571 -0.629422 -0.032916 0.414784
2013-01-06 -0.482070 0.127791 -0.941574 1.121891
A B C D E
2013-01-01 -0.181389 -0.088044 -0.459681 -0.201029 1.0
2013-01-02 1.675779 -0.554188 0.973400 0.891679 NaN
2013-01-03 -1.030581 -0.100559 -0.342108 0.189982 1.0
2013-01-04 1.398232 -0.986472 -1.517883 0.358021 NaN
index A B C D E
0 2013-01-01 -0.181389 -0.088044 -0.459681 -0.201029 1.0
1 2013-01-03 -1.030581 -0.100559 -0.342108 0.189982 1.0
# Variant with drop=True: the old date index is discarded instead of being
# kept as an "index" column. The pasted output that follows comes from a
# fresh run, i.e. df1 is expected to be the freshly reindexed frame (date
# index, columns A-E), not the result of an earlier reset_index call.
df1 = df1[df1.E == 1].reset_index(drop=True)
print(df1)  # print(...) with one argument works on both Python 2 and 3
A B C D
2013-01-01 0.413498 -1.059979 -1.023005 0.264287
2013-01-02 -0.483094 -0.573347 -1.563777 -0.098674
2013-01-03 1.807314 0.279142 -1.288200 1.786838
2013-01-04 -0.726217 -0.264770 1.457457 0.556120
2013-01-05 -0.132457 0.241934 0.716555 -0.131015
2013-01-06 0.413555 1.040257 -0.394696 0.270663
A B C D E
2013-01-01 0.413498 -1.059979 -1.023005 0.264287 1.0
2013-01-02 -0.483094 -0.573347 -1.563777 -0.098674 NaN
2013-01-03 1.807314 0.279142 -1.288200 1.786838 1.0
2013-01-04 -0.726217 -0.264770 1.457457 0.556120 NaN
A B C D E
0 0.413498 -1.059979 -1.023005 0.264287 1.0
1 1.807314 0.279142 -1.288200 1.786838 1.0
2. preprocessing.LabelEncoder 标签编码
#简单来说 LabelEncoder 是对不连续的数字或者文本进行编号
from sklearn.preprocessing import LabelEncoder

# Fit on the distinct values; each one is mapped to its position in the
# sorted list of classes (1 -> 0, 5 -> 1, 67 -> 2, 100 -> 3).
le = LabelEncoder()
le.fit([1, 5, 67, 100])

# Encode a sequence made of the fitted values.
label = le.transform([1, 1, 100, 67, 5])
print(label)  # print(...) with one argument works on both Python 2 and 3
[0 0 3 2 1]
3. 列表推导式
创建一个包含0到9的平方的列表
# Squares of 0..9 built with an explicit loop ...
squares = []
for x in range(10):
    squares.append(x ** 2)
print(squares)  # print(...) with one argument works on both Python 2 and 3

# ... and the same list built with a list comprehension.
squares = [x ** 2 for x in range(10)]
print(squares)
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]
0到9中能被3整除的数字列表
# Multiples of 3 in 0..9 collected with an explicit loop and condition ...
numbers = []
for x in range(10):
    if x % 3 == 0:
        numbers.append(x)
print(numbers)  # print(...) with one argument works on both Python 2 and 3

# ... and the same list built with a filtered list comprehension.
numbers = [x for x in range(10) if x % 3 == 0]
print(numbers)
[0, 3, 6, 9]
[0, 3, 6, 9]
# Keep every column name except 'a' and 'c'.
col = ['a', 'b', 'c', 'd']
f = [x for x in col if x not in ['a', 'c']]
print(f)  # print(...) with one argument works on both Python 2 and 3
['b', 'd']
4.concat
可以将数据根据不同的轴作简单的融合
# Two frames with the same columns but different row labels.
df1 = pd.DataFrame(np.random.randn(2, 3), columns=list('ABC'))
print(df1)  # print(...) with one argument works on both Python 2 and 3
df2 = pd.DataFrame(np.random.randn(2, 3), index=[5, 6], columns=list('ABC'))
print(df2)

# Default axis=0: rows of df2 are appended below the rows of df1.
result = pd.concat([df1, df2])
print(result)
A B C
0 -2.094518 -0.077125 1.66726
1 -0.044491 -0.000471 0.94399
A B C
5 -0.384184 -0.827485 -1.222335
6 0.344529 0.834145 -0.537666
A B C
0 -2.094518 -0.077125 1.667260
1 -0.044491 -0.000471 0.943990
5 -0.384184 -0.827485 -1.222335
6 0.344529 0.834145 -0.537666
# Two frames with the same row labels but completely different columns.
df1 = pd.DataFrame(np.random.randn(2, 3), columns=list('ABC'))
print(df1)  # print(...) with one argument works on both Python 2 and 3
df2 = pd.DataFrame(np.random.randn(2, 3), columns=list('DEF'))
print(df2)

# Default axis=0 with disjoint columns: the result has the union of the
# columns, and cells that a frame does not provide are filled with NaN.
result = pd.concat([df1, df2])
print(result)
A B C
0 1.385359 -0.924547 0.189359
1 1.530118 1.179600 1.187794
D E F
0 -0.903516 -1.256129 1.525891
1 0.387177 -0.316178 0.837227
A B C D E F
0 1.385359 -0.924547 0.189359 NaN NaN NaN
1 1.530118 1.179600 1.187794 NaN NaN NaN
0 NaN NaN NaN -0.903516 -1.256129 1.525891
1 NaN NaN NaN 0.387177 -0.316178 0.837227
# Two frames with identical row labels and identical column names.
df1 = pd.DataFrame(np.random.randn(2, 3), columns=list('ABC'))
print(df1)  # print(...) with one argument works on both Python 2 and 3
df2 = pd.DataFrame(np.random.randn(2, 3), columns=list('ABC'))
print(df2)

# axis=1: the frames are placed side by side, aligned on the row index;
# duplicate column names are kept as-is.
result = pd.concat([df1, df2], axis=1)
print(result)
A B C
0 -0.090930 -0.529010 2.109190
1 -0.476415 -1.586248 -0.516491
A B C
0 2.063089 -1.651642 -0.538985
1 0.769949 0.436256 -0.466150
A B C A B C
0 -0.090930 -0.529010 2.109190 2.063089 -1.651642 -0.538985
1 -0.476415 -1.586248 -0.516491 0.769949 0.436256 -0.466150