Python 与 R

import numpy as np
import pandas as pd
# Build a 10-row example DataFrame: an integer range, standard-normal noise,
# a random label drawn from 'A'/'B'/'C', and a random 0/1 flag.
df = pd.DataFrame({'col_a': np.arange(10), 
                   'col_b': np.random.randn(10), 
                   'col_c': np.random.choice(['A', 'B', 'C'], 10), 
                   'col_d': np.random.choice([0, 1], 10)})
# Preview the first 5 rows.
df.head(5)

# R code:
# df <- data.frame(col_a = 0:9,
#                  col_b = rnorm(10),
#                  col_c = sample(c('A', 'B', 'C'), size = 10, replace = TRUE),
#                  col_d = sample(c(0, 1), size = 10, replace = TRUE),
#                  stringsAsFactors = FALSE)
# head(df, 5)
col_acol_bcol_ccol_d
00-1.030159B0
111.447636B1
221.572227C0
330.031504A1
440.071705C0
# Print the (rows, columns) shape tuple, then the row count and the column count.
print(df.shape, df.shape[0], df.shape[1])

# R code:
# dim(df), nrow(df), ncol(df)

(10, 4) 10 4
# List the column labels.
df.columns

# R code:
# names(df)
Index(['col_a', 'col_b', 'col_c', 'col_d'], dtype='object')
# Select the first 5 rows by position
df.iloc[:5]

# R code:
# df[1:5, ]
col_acol_bcol_ccol_d
00-1.030159B0
111.447636B1
221.572227C0
330.031504A1
440.071705C0
# Select columns col_a and col_b
df[['col_a', 'col_b']]

# R code:
# df[, c('col_a', 'col_b')]
col_acol_b
00-1.030159
111.447636
221.572227
330.031504
440.071705
55-0.284231
660.403412
771.271582
880.693771
991.510458
# Select the first 5 rows and the first 2 columns by position
df.iloc[:5, :2]

# R code:
# df[1:5, 1:2]
col_acol_b
00-1.030159
111.447636
221.572227
330.031504
440.071705
# Select a single value (scalar) by integer position: row 0, column 1
df.iat[0, 1]

# R code (R indexing is 1-based, hence [1, 2]):
# df[1, 2]
-1.0301593908948492
# Filter rows where col_a > 3 and col_b < 0
df[(df['col_a'] > 3) & (df['col_b'] < 0)]
# or, equivalently, with a query string:
# df.query('col_a > 3 & col_b < 0')

# R code:
# df[df$col_a > 3 & df$col_b < 0, ]
col_acol_bcol_ccol_d
55-0.284231A0
# Keep rows whose col_c value is 'A' or 'B'
df[df['col_c'].isin(['A', 'B'])]

# R code:
# df[df$col_c %in% c('A', 'B'), ]
col_acol_bcol_ccol_d
00-1.030159B0
111.447636B1
330.031504A1
55-0.284231A0
771.271582A0
880.693771B1
991.510458A1
# Add a new column col_e as the element-wise sum of col_a and col_b
df['col_e'] = df['col_a'] + df['col_b']
df

# R code:
# df$col_e <- df$col_a + df$col_b
col_acol_bcol_ccol_dcol_e
00-1.030159B0-1.030159
111.447636B12.447636
221.572227C03.572227
330.031504A13.031504
440.071705C04.071705
55-0.284231A04.715769
660.403412C16.403412
771.271582A08.271582
880.693771B18.693771
991.510458A110.510458
# Drop column col_e (the result is assigned back to df)
df = df.drop(columns='col_e')
df

# R code:
# df <- df[, !names(df) == 'col_e']
col_acol_bcol_ccol_d
00-1.030159B0
111.447636B1
221.572227C0
330.031504A1
440.071705C0
55-0.284231A0
660.403412C1
771.271582A0
880.693771B1
991.510458A1
# Drop the first column; the result is only displayed, not assigned back to df
df.drop(columns=df.columns[0])

# R code:
# df[, -1]
col_bcol_ccol_d
0-1.030159B0
11.447636B1
21.572227C0
30.031504A1
40.071705C0
5-0.284231A0
60.403412C1
71.271582A0
80.693771B1
91.510458A1
# Transpose the DataFrame (rows become columns and vice versa)
df.T

# R code:
# t(df)
0123456789
col_a0123456789
col_b-1.030161.447641.572230.03150430.0717051-0.2842310.4034121.271580.6937711.51046
col_cBBCACACABA
col_d0101001011
# Convert col_a to strings
df['col_a'].astype(str)

# R code:
# as.character(df$col_a)
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
Name: col_a, dtype: object
# Convert col_c to a categorical (the pandas analogue of an R factor)
pd.Categorical(df['col_c'])

# R code:
# factor(df$col_c)
[B, B, C, A, C, A, C, A, B, A]
Categories (3, object): [A, B, C]
# Row-wise aggregation: sum col_a and col_b across each row
df[['col_a', 'col_b']].sum(axis=1)

# R code:
# apply(df[, c('col_a', 'col_b')], 1, sum)
0    -1.030159
1     2.447636
2     3.572227
3     3.031504
4     4.071705
5     4.715769
6     6.403412
7     8.271582
8     8.693771
9    10.510458
dtype: float64
# Column-wise aggregation: mean of each column
df[['col_a', 'col_b']].mean(axis=0)

# R code:
# apply(df[, c('col_a', 'col_b')], 2, mean)
col_a    4.50000
col_b    0.56879
dtype: float64
# Apply a custom function to each column (here: the column mean plus 10)
df[['col_a', 'col_b']].apply(lambda x: x.mean() + 10)

# R code:
# apply(df[, c('col_a', 'col_b')], 2, function(x) mean(x) + 10)
col_a    14.50000
col_b    10.56879
dtype: float64
# Second example frame: col_x counts 0..9, col_y is the same range reversed
df2 = pd.DataFrame({'col_x': np.arange(10), 
                    'col_y': np.arange(10)[::-1]})
df2
col_xcol_y
009
118
227
336
445
554
663
772
881
990
# Combining data
# Bind columns: concatenate df and df2 side by side
pd.concat([df, df2], axis=1)


# R code:
# cbind(df, df2)
col_acol_bcol_ccol_dcol_xcol_y
00-1.030159B009
111.447636B118
221.572227C027
330.031504A136
440.071705C045
55-0.284231A054
660.403412C163
771.271582A072
880.693771B181
991.510458A190
# Two extra rows using the same column names as df, used for the row-binding example
df3 = pd.DataFrame({'col_a': [-1, -2], 
                    'col_b' : [0, 1], 
                    'col_c': ['B', 'C'], 
                    'col_d': [1, 0]})
df3
col_acol_bcol_ccol_d
0-10B1
1-21C0
# Bind rows: stack df3 under df and renumber the index from 0
pd.concat([df, df3], axis=0, ignore_index=True)

# R code:
# rbind(df, df3)
col_acol_bcol_ccol_d
00-1.030159B0
111.447636B1
221.572227C0
330.031504A1
440.071705C0
55-0.284231A0
660.403412C1
771.271582A0
880.693771B1
991.510458A1
10-10.000000B1
11-21.000000C0

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值