记录常用的pandas操作
import pandas as pd
#读csv文件
data= pd.read_csv(r'D:\station_choose.csv',encoding='gbk')
#读excel文件
data= pd.read_excel(r'D:\前1000查询线路.xlsx',encoding='gbk')
#查看列数和行数
data.shape
#查看类型
data.dtypes
#查看数据内容
data.head(20)
#填充nan值
data.fillna(0)
#除法
data['percent']=data['click']/data['show']
#删除列
data.drop(columns=['index'],inplace=True)
#只选取某些列
city_map=pd.DataFrame(city_map,columns=['station_name','city_name'])
#列重命名
data.rename(columns={'name': 'com_city_name','common_city_id':'com_city_id'}, inplace=True)
#排序
data=data.sort_values(by=['percent','not_find_num','find_num'],ascending=False).reset_index()
#新建一列
data['cross_car']=1
#join操作
merge_data=pd.merge(left_data,right_data,how='left',on=['from_place','to_place']).fillna(0)
merge_data=pd.merge(left_data,right_data,how='inner',left_on='from_id',right_on='id')
#保存操作
columns = ['from_place','to_place','find_num','not_find_num','total_num','percent']
merge_data.to_csv("D:/seach_percent_10.15-11.15.csv",encoding='gbk',index=False,columns=columns)
#保存为json
data.to_json(json_path, orient='records',lines =True,force_ascii=False)
#过滤filter
data[data['total_type']==7].head()
data.query('category==2.0').head()
#双条件filter
result=data[(data['from_place']!='重庆') &(data['to_place']!='重庆')]
#两个表前后拼接
merge_data= pd.concat([data1,data2])
#group by操作
data=data.groupby(['start_city','end_city']).agg({"num":"sum"}).reset_index()
#row_number函数
#partionby分组字段,orderby排序的字段,是否降序
def row_number(dataset, partionby, orderby,isasc):
return dataset[orderby].groupby(dataset[partionby]).rank(ascending=isasc, method='first')
#sql: row_number over (partition by start_station_code,end_station_code,transfer_city_id order by 7day_order_num desc) as order_rank
station_choose['order_rank']=station_choose.groupby(['start_station_code','end_station_code','transfer_city_id'])['7day_order_num'].rank(ascending=False, method='first')
#sql if-else操作,如果end_city_name为null,用end_city替代,不为null用end_city_name字段的值填充
all_result['end_city_name']=all_result['end_city_name'].fillna(all_result['end_city'])
#sql , if (source=='value',num,0) as new_num
def label_race (row,value):
if row['source']==value:
return row['num']
else:
return 0
all_result['new_num']=all_result.apply (lambda row: label_race (row,0),axis=1)
# sql ,if-else
data['click']=data.apply(lambda x: x['nums'] if x['isclick'] ==1 else 0, axis=1)