介绍下本人。中山大学,医学生+计科学生的集合体,机器学习爱好者。
一、Series 和 DataFrame 的区别
# A Series is one-dimensional. With no explicit index it is labelled 0..n-1.
s1 = pd.Series(data=[1, 2, 4, 6, 7, 2])
print(s1)
# 0    1
# 1    2
# 2    4
# 3    6
# 4    7
# 5    2
# dtype: int64

# The same kind of object, but with custom row labels.
s2 = pd.Series(data=[4, 3, 1, 57, 8], index=list('abcde'))
print(s2)
# a     4
# b     3
# c     1
# d    57
# e     8
# dtype: int64

# A DataFrame is two-dimensional: a flat list becomes a single column,
# and that column is labelled 0 when no column names are given.
s3 = pd.DataFrame(data=[1, 2, 4, 6, 7, 2])
print(s3)
#    0
# 0  1
# 1  2
# 2  4
# 3  6
# 4  7
# 5  2

# A DataFrame with explicit row labels and an explicit column name.
s4 = pd.DataFrame(
    data=[4, 3, 1, 57, 8],
    index=list('abcde'),
    columns=['A'],
)
print(s4)
#     A
# a   4
# b   3
# c   1
# d  57
# e   8
二、Series 和 DataFrame 的 rename(重命名)
# To rename DataFrame columns, pass the mapping through ``columns=``.
# Each entry of the mapping reads {old_name: new_name}.

# reset_index() on the nunique() result gives two columns, labelled
# 'index' and 0 (see the printed header below).
train_nuni1 = (
    train
    .nunique()
    .reset_index()
)
print(train_nuni1.head())
#          index        0
# 0  instance_id  1001650
# 1         time   443670
# 2         city      333
# 3     province       35
# 4    user_tags   464200

# Same table, with both default labels replaced by meaningful names.
train_nuni2 = (
    train
    .nunique()
    .reset_index()
    .rename(columns={'index': 'feat', 0: 'data_nunique'})
)
print(train_nuni2.head())
#           feat  data_nunique
# 0  instance_id       1001650
# 1         time        443670
# 2         city           333
# 3     province            35
# 4    user_tags        464200

# Count the 'adid' entries per instance_id; reset_index() turns the group
# key back into an ordinary column.
adid_cnt1 = (
    data
    .groupby(['instance_id'])['adid']
    .count()
    .reset_index()
)
print(adid_cnt1.head())
#       instance_id  adid
# 0  12204730147108     1
# 1  33640746791159     1
# 2  37672601301929     1
# 3  63921594271660     1
# 4  65319037300634     1

# The same aggregation, with both columns renamed via ``columns=``.
adid_cnt2 = (
    data
    .groupby(['instance_id'])['adid']
    .count()
    .reset_index()
    .rename(columns={'instance_id': 'instance_id1', 'adid': 'adid_cnt'})
)
print(adid_cnt2.head())
#      instance_id1  adid_cnt
# 0  12204730147108         1
# 1  33640746791159         1
# 2  37672601301929         1
# 3  63921594271660         1
# 4  65319037300634         1
# Pitfall 1: the mapping is passed positionally, without ``columns=``.
# DataFrame.rename then applies it to the row labels (axis 0) instead of the
# column names. No row label matches 'instance_id' or 'adid', so nothing is
# renamed and no error is raised -- the header below is unchanged.
adid_cnt3=data.groupby(['instance_id'])['adid'].count().reset_index().rename({'instance_id':'instance_id1','adid':'adid_cnt'})
print(adid_cnt3.head())
#       instance_id  adid
# 0  12204730147108     1
# 1  33640746791159     1
# 2  37672601301929     1
# 3  63921594271660     1
# 4  65319037300634     1
# Pitfall 2: reset_index() has no ``drop_index`` keyword (the pandas keyword
# is ``drop``), so this statement raises before rename() is ever reached.
adid_cnt4=data.groupby(['instance_id'])['adid'].count().reset_index(drop_index=True).rename({'instance_id':'instance_id1','adid':'adid_cnt'})
print(adid_cnt4.head())
# TypeError: reset_index() got an unexpected keyword argument 'drop_index'
三、注意返回值的类型(Series 还是 DataFrame)
# nunique() on its own returns a Series (assuming `train` is a DataFrame,
# as the printed type below indicates).
train_nuni1 = train.nunique()
print(type(train_nuni1))  # <class 'pandas.core.series.Series'>

# Chaining reset_index() onto that result turns it into a DataFrame.
train_nuni2 = train.nunique().reset_index()
print(type(train_nuni2))  # <class 'pandas.core.frame.DataFrame'>
四、DataFrame 和 Series 的合并
# Build one two-column table of unique-value counts per input frame.
# Note the naming: the train and test tables share the column names
# ('feat', 'data_nunique'), while the table for `data` uses
# ('feat1', 'data_nunique1').

train_nuni = (
    train
    .nunique()
    .reset_index()
    .rename(columns={'index': 'feat', 0: 'data_nunique'})
)
print(train_nuni.head())
#           feat  data_nunique
# 0  instance_id       1001650
# 1         time        443670
# 2         city           333
# 3     province            35
# 4    user_tags        464200

test_nuni = (
    test
    .nunique()
    .reset_index()
    .rename(columns={'index': 'feat', 0: 'data_nunique'})
)
print(test_nuni.head())
#           feat  data_nunique
# 0  instance_id         40024
# 1         time         30639
# 2         city           332
# 3     province            35
# 4    user_tags         20237

data_nuni = (
    data
    .nunique()
    .reset_index()
    .rename(columns={'index': 'feat1', 0: 'data_nunique1'})
)
print(data_nuni.head())
#                    feat1  data_nunique1
# 0                   adid           2113
# 1              advert_id             39
# 2  advert_industry_inner             25
# 3            advert_name             35
# 4            app_cate_id             23
# Combine the per-frame unique counts into one table keyed by feature name.
#
# With no ``on=``, pd.merge joins on every column name the two frames share.
# Here that is both the feature-name column and the count column, so a left
# join returns the left frame unchanged and never brings in the other
# frame's counts. To fix this, join on the feature-name column only and give
# the right-hand count column its own name before merging.

# Step 1: attach the counts of `test` to the counts of `train`.
df_nunique = pd.merge(
    train_nuni,
    test_nuni.rename(columns={'data_nunique': 'nunique_test'}),
    how='left',
    on='feat',
).rename(columns={'feat': 'feat1', 'data_nunique': 'data_nunique1'})
print(df_nunique.head())
# Columns: feat1, data_nunique1 (counts from `train`), nunique_test.
# Features missing from test_nuni get NaN in nunique_test.

# Step 2: attach the counts of `data` in the same way.
df_nunique = pd.merge(
    df_nunique,
    data_nuni.rename(columns={'data_nunique1': 'nunique_data'}),
    how='left',
    on='feat1',
).rename(columns={'feat1': 'feat2', 'data_nunique1': 'data_nunique2'})
print(df_nunique.head())
# Columns: feat2, data_nunique2 (counts from `train`), nunique_test,
# nunique_data. Rows and their order still follow train_nuni, because both
# joins are left joins on a unique key.
# Alternative: build the table directly from the three nunique() results.
# Each dict key becomes a column and the values are aligned on their row
# labels (the feature names).
df_nuni = pd.DataFrame(
    dict(
        train=train.nunique(),
        test=test.nunique(),
        data=data.nunique(),
    )
)
print(df_nuni)
# Column order below is as printed in the author's environment.
# NOTE(review): recent pandas versions keep the dict insertion order
# (train, test, data) -- confirm against the pandas version in use.
#                        data   test  train
# adid                   2113  881.0   2079
# advert_id                39   31.0     38
# advert_industry_inner    25   21.0     24
# advert_name              35   29.0     34
# app_cate_id              23   20.0     22
# app_id                  440  174.0    438
# app_paid                  1    1.0      1
# campaign_id              66   45.0     64

# Same construction, then reset_index() so the feature names become an
# ordinary column called 'index'.
df_nuni = pd.DataFrame(
    dict(
        train=train.nunique(),
        test=test.nunique(),
        data=data.nunique(),
    )
).reset_index()
print(df_nuni)
#                    index  data   test  train
# 0                   adid  2113  881.0   2079
# 1              advert_id    39   31.0     38
# 2  advert_industry_inner    25   21.0     24
# 3            advert_name    35   29.0     34
# 4            app_cate_id    23   20.0     22
# 5                 app_id   440  174.0    438
总结到此为止,如有不足之处,欢迎批评指正~