for 遍历循环
# Odd integers below 20: n % 2 is 1 (truthy) exactly for the odd values.
data = [n for n in range(20) if n % 2]
# Number of keys in data['content'] that start with 'IC'.
# Each matching key contributes 1, so the sum is the count; an empty
# mapping gives 0.
total_sum = sum(1 for key in data['content'].keys() if key.startswith('IC'))
zhibiao_list = list(data['content'].keys())
# Prefix of a key = the text before its first underscore.
zhibiao_set = {key.split('_')[0] for key in zhibiao_list}
# Count how many keys share each prefix, in a single pass over the keys.
# Keys are grouped by their exact prefix. A substring test (`prefix in key`)
# would also count a key under every other prefix that merely occurs inside
# it, e.g. a key 'ICIR_x' would be counted for both 'IC' and 'ICIR'.
dict1 = {}
for key in zhibiao_list:
    prefix = key.split('_')[0]
    # On the first occurrence the prefix is not in dict1 yet, so start from 0.
    dict1[prefix] = dict1.get(prefix, 0) + 1
lambda函数
pd.to_numeric转换为数值型变量
set()
# Clean the two amount columns as text, then convert them to numbers.
# With errors='coerce', text that cannot be parsed becomes NaN instead of
# raising.
data['应收金额'] = pd.to_numeric(
    data['应收金额'].apply(lambda cell: str(cell).strip('Y').strip('^')),
    errors='coerce',
)
data['实收金额'] = pd.to_numeric(
    data['实收金额'].apply(lambda cell: str(cell).replace(' ', '')),
    errors='coerce',
)
# Same cleaning followed by a strict cast to float. The results are not
# assigned back to the frame.
data['应收金额'].apply(lambda cell: str(cell).strip('Y').strip('^')).astype(float)
data['实收金额'].apply(lambda cell: str(cell).replace(' ', '')).astype(float)
# Pivot df2 with '年份' as rows and '国家' as columns; each cell holds the
# set built from the '企业' values that fall into that group.
df3 = df2.pivot_table(
    index='年份',
    columns='国家',
    values='企业',
    aggfunc=lambda group: set(group),
)
apply(f)
P99,P1异常值替换(注意:下方代码的上限实际取的是95分位数,下限取1分位数)
# Replace outliers using the 95th percentile and the 1st percentile.
# Compute P1 (lower bound) and P95 (upper bound) of the 'Age' column.
# NOTE(review): the clipping function in this file is named P1_P99, but the
# upper bound computed here is the 0.95 quantile - confirm which is intended.
P_min =df['Age'].quantile(0.01)
P_max = df['Age'].quantile(0.95)
def P1_P99(x, lower=None, upper=None):
    """Clamp a single value to the interval [lower, upper].

    Args:
        x: The value to clamp.
        lower: Lower bound. When omitted, the module-level ``P_min`` is used.
        upper: Upper bound. When omitted, the module-level ``P_max`` is used.

    Returns:
        ``upper`` if ``x > upper``, ``lower`` if ``x < lower``, otherwise
        ``x`` unchanged. A NaN fails both comparisons and is returned as-is.

    Note:
        Despite the "P99" in the name, ``P_max`` in this file is computed as
        the 0.95 quantile, so the default upper bound is the 95th percentile.
    """
    # The globals are looked up only when a bound is not supplied, so the
    # function can be reused for other columns by passing explicit bounds.
    if lower is None:
        lower = P_min
    if upper is None:
        upper = P_max
    if x > upper:
        return upper
    if x < lower:
        return lower
    return x
# Create a new column and fill it via apply: every 'Age' value is passed
# through P1_P99, and the original 'Age' column is left unchanged.
df['Age_new'] = df['Age'].apply(P1_P99)
离散型变量处理
def f(x):
    """Translate a floor-level code into its Chinese label.

    Args:
        x: The raw value; 'high', 'middle' and 'low' are the recognised codes.

    Returns:
        '高层' for 'high', '中层' for 'middle', '底层' for 'low', and '缺失值'
        for anything else (including None, NaN and non-string values).
    """
    labels = {'high': '高层', 'middle': '中层', 'low': '底层'}
    # Only strings can match a code. The guard also keeps unhashable inputs
    # (e.g. a list) from raising inside the dict lookup.
    if isinstance(x, str):
        return labels.get(x, '缺失值')
    return '缺失值'
# Overwrite the 'floor' column with the label f returns for each value;
# f yields '缺失值' for anything other than 'high', 'middle' or 'low'.
df['floor']=df['floor'].apply(f)
统计计算
# Overwrites 'price' in place, so running this line again would take the
# logarithm a second time.
df['price']=df['price'].apply(np.log)  # take the natural logarithm
map(dict)
英文名改中文名
# Lookup table: district name in pinyin -> district name in Chinese.
dict1 = {
    'chaoyang': '朝阳',
    'dongcheng': '东城',
    'fengtai': '丰台',
    'haidian': '海淀',
    'shijingshan': '石景山',
    'xicheng': '西城',
}
# Replace each 'dist' value with its entry in dict1.
# NOTE: per pandas' Series.map semantics, values that are not keys of dict1
# become NaN.
df['dist']=df['dist'].map(dict1)