读取 CSV 文件,并把名为 'item' 的列设置为索引(index):
import numpy as np
import pandas as pd
import os
import csv
# Load data.csv from the current working directory, using the 'item' column as
# the row index, then show the first ten rows.
# os.path.join replaces the hard-coded Windows '\\' separator so the path also
# resolves on other platforms; the `with` block guarantees the file handle is
# closed once pandas has finished reading (the original never closed it).
# open() is kept (rather than handing the path to read_csv) so the text
# decoding stays whatever open() chooses by default, exactly as before.
path = os.path.join(os.getcwd(), 'data.csv')
with open(path) as f:
    users = pd.read_csv(f, index_col='item')
print(users.head(10))
统计某一列中不重复值(唯一值)的数量:
# Number of distinct values found in the 'high' column.
users.high.nunique()
数据描述性统计:
# Descriptive statistics for the whole frame.
users.describe()
# Descriptive statistics restricted to the single 'high' column.
users['high'].describe()
按某一列降序排序,取该列最大值所在的行:
# Sort by 'high' from largest to smallest and keep only the first row, i.e.
# the row holding the maximum 'high' value.
# NOTE(review): `chipo` is not defined in the visible part of this file (the
# frame loaded earlier is `users`) — confirm which frame is intended.
chipo.sort_values('high', ascending=False).iloc[:1]
按某一列升序排序,取该列最小值所在的行:
# Sort by 'high' from smallest to largest and keep only the first row, i.e.
# the row holding the minimum 'high' value.
# NOTE(review): `chipo` is not defined in the visible part of this file (the
# frame loaded earlier is `users`) — confirm which frame is intended.
chipo.sort_values('high', ascending=True).iloc[:1]
筛选同时满足两个条件的行(某一列等于指定类别,且另一列大于某个数值):
# Keep the rows whose item_name equals "Canned Soda" AND whose quantity is
# greater than 1; both conditions are combined element-wise with `&`.
chipo_drink_steak_bowl = chipo.loc[
    (chipo['item_name'] == "Canned Soda") & (chipo['quantity'] > 1)
]
按整数位置选择数据的行和列:iloc
# Positional selection: every row, and the columns at positions 0 through 5
# (the stop index 6 is exclusive).
euro12.iloc[:, :6]
按标签选择数据的行和列:loc
# Label-based selection: the rows labelled 'Maine' and 'Alaska', restricted to
# the three named columns.
army.loc[
    ['Maine', 'Alaska'],
    ['deaths', 'size', 'deserters'],
]
选择列数据:
# A single column label selects one column.
army.loc[:, 'Goals']
# Several columns must be passed as a list of labels (hence the inner brackets).
army.loc[:, ['Goals', 'Shots on target']]
按条件选择数据
# Rows where Goals is greater than 3.
army.loc[army['Goals'] > 3]
# Rows where Goals is strictly between 3 and 5 (both conditions must hold).
army.loc[army['Goals'].gt(3) & army['Goals'].lt(5)]
# Rows where either condition holds.
# NOTE(review): "> 3 or < 5" excludes no numeric value, so this only
# demonstrates the `|` syntax — confirm whether other bounds were intended.
army.loc[army['Goals'].gt(3) | army['Goals'].lt(5)]
分组聚合:groupby:
# Mean of beer_servings within each continent group.
drinks.groupby('continent')['beer_servings'].mean()
# Descriptive statistics of wine_servings within each continent group.
drinks.groupby('continent')['wine_servings'].describe()
# agg takes a list of aggregation names, so several statistics of
# spirit_servings are computed per continent in one call.
drinks.groupby('continent')['spirit_servings'].agg(['mean', 'min', 'max'])