# 注意:通过索引(切片)得到的是视图,不是副本
1.导包
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
2.Series(数据类型)
2.1 创建Series
obj = Series( [ 4 , 7 , - 5 , 3 ] )
obj. values
array( [ 4 , 7 , - 5 , 3 ] , dtype= int64)
obj. index
RangeIndex( start= 0 , stop= 4 , step= 1 )
obj2 = Series( [ 4 , 7 , - 5 , 3 ] , index= [ 'd' , 'b' , 'a' , 'c' ] )
sdata = { 'Ohio' : 35000 , 'Texas' : 71000 , 'Oregon' : 16000 , 'Utah' : 5000 }
obj3 = Series( sdata)
2.2 取值
obj2[ 'a' ]
obj2[ [ 'a' , 'b' ] ]
'b' in obj2
pd. isnull( obj4)
pd. notnull( obj4)
3.DataFrame(数据类型)
3.1 定义
DataFrame 是一个表格型的数据结构,既有行索引也有列索引。每列的数据类型可以不相同,但各列共用同一个行索引。
3.2 创建
data = { 'state' : [ 'Ohio' , 'Ohio' , 'Ohio' , 'Nevada' , 'Nevada' ] ,
'year' : [ 2000 , 2001 , 2002 , 2001 , 2002 ] ,
'pop' : [ 1.5 , 1.7 , 3.6 , 2.4 , 2.9 ] }
frame = DataFrame( data)
frame_test = DataFrame( data, index= [ 'a' , 'b' , 'c' , 'd' , 'e' ] )
3.3 简单的增删改查
frame[ 'state' ]
frame. state
frame[ 'debt' ] = np. arange( 5 )
frame[ 'debt' ] = 16.5
val = Series( [ 2 , 3 ] , index= [ 1 , 2 ] )
frame. loc[ 1 : 2 , 'year' ] = val
del frame[ 'year' ]
3.4 索引对象(暂时不讲)
4.基本功能
4.1 重新索引
功能:pandas中的reindex方法可以为series和dataframe添加或者删除索引。
方法:Series.reindex()、DataFrame.reindex()
如果新添加的索引在原数组的索引中不存在,则默认为nan。如果减少索引,就相当于一个切片操作。
obj = Series( [ 4.5 , 7.2 , - 5.3 , 3.6 ] , index= [ 'd' , 'b' , 'a' , 'c' ] )
d 4.5
b 7.2
a -5.3
c 3.6
obj2 = obj. reindex( [ 'a' , 'b' , 'c' , 'd' , 'e' ] , fill_value= 0 )
obj3 = Series( [ 'blue' , 'purple' , 'yellow' ] , index= [ 0 , 2 , 4 ] )
obj3. reindex( range ( 6 ) , method= 'ffill' )
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
frame = DataFrame( np. arange( 9 ) . reshape( 3 , 3 ) , index= [ 'a' , 'c' , 'd' ] ,
columns= [ 'Ohio' , 'Texas' , 'California' ] )
frame. reindex( columns= [ 'Texas' , 'Utah' , 'California' ] )
method参数选项 说明 ffill或pad 前向填充(或搬运)值 bfill或backfill 后向填充(或搬运)值
reindex函数的参数
参数 说明 index 新的行索引 columns 新的列索引 method 插值(填充)方式 fill_value 缺失值的替代值 limit 前向或后向填充时的最大填充量 copy 默认为True,无论如何都复制;如果为False,则新旧索引相等时不复制
frame. drop( [ 'Ohio' , 'Texas' ] , axis= 1 )
4.2 选取值
data = DataFrame( np. arange( 16 ) . reshape( 4 , 4 ) ,
index= [ 'Ohio' , 'Colorado' , 'Utah' , 'New York' ] ,
columns= [ 'one' , 'two' , 'three' , 'four' ] )
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data = DataFrame( np. arange( 16 ) . reshape( 4 , 4 ) ,
index= [ 'Ohio' , 'Colorado' , 'Utah' , 'New York' ] ,
columns= [ 'one' , 'two' , 'three' , 'four' ] )
data
one two three four Ohio 0 1 2 3 Colorado 4 5 6 7 Utah 8 9 10 11 New York 12 13 14 15
data. loc[ [ 'Colorado' , 'Utah' ] , [ 'two' , 'three' ] ]
two three Colorado 5 6 Utah 9 10
data. ix[ [ 'Colorado' , 'Utah' ] , [ 'two' , 'three' ] ]
D:\anaconda\lib\site-packages\ipykernel_launcher.py:1: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
two three Colorado 5 6 Utah 9 10
DataFrame 索引选项
类型 说明 obj[val] 选取DataFrame的单个列或一组列 obj.ix[val] 选取单行或一组行(.ix已弃用,并在pandas 1.0中移除,请改用.loc/.iloc) obj.reindex(index=val1, columns=val2) 将行和列同时匹配到新索引 obj.loc[val1,val2] 同时选行和列
4.4 算术运算和数据对齐
df1 = DataFrame( np. arange( 12 ) . reshape( 3 , 4 ) , columns= list ( 'abcd' ) )
df1
df2 = DataFrame( np. arange( 20 ) . reshape( 4 , 5 ) , columns= list ( 'abcde' ) )
df2
a b c d e 0 0 1 2 3 4 1 5 6 7 8 9 2 10 11 12 13 14 3 15 16 17 18 19
df1 + df2
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 11.0 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
df1. add( df2, fill_value= 0 )
a b c d e 0 0.0 2.0 4.0 6.0 4.0 1 9.0 11.0 13.0 15.0 9.0 2 18.0 20.0 22.0 24.0 14.0 3 15.0 16.0 17.0 18.0 19.0
4.5 DataFrame和Series之间的运算
frame = DataFrame( np. arange( 12 ) . reshape( 4 , 3 ) , columns= list ( 'bde' ) ,
index= [ 'Utah' , 'Ohio' , 'Texas' , 'Oregon' ] )
frame
b d e Utah 0 1 2 Ohio 3 4 5 Texas 6 7 8 Oregon 9 10 11
frame. loc[ 'Utah' ]
b 0
d 1
e 2
Name: Utah, dtype: int32
frame. sub( frame. loc[ 'Utah' ] , axis= 1 )
b d e
Utah 0 0 0
Ohio 3 3 3
Texas 6 6 6
Oregon 9 9 9
frame. apply ( lambda x: x. max ( ) - x. min ( ) )
b 9
d 9
e 9
dtype: int64
frame. apply ( lambda x: x. max ( ) - x. min ( ) , axis= 1 )
Utah 2
Ohio 2
Texas 2
Oregon 2
dtype: int64
frame. applymap( lambda x: '%.2f' % x)
b d e Utah 0.00 1.00 2.00 Ohio 3.00 4.00 5.00 Texas 6.00 7.00 8.00 Oregon 9.00 10.00 11.00
frame. sort_values( by= 'b' , ascending= False )
b d e Oregon 9 10 11 Texas 6 7 8 Ohio 3 4 5 Utah 0 1 2
frame. sort_values( by= 'Utah' , axis= 1 , ascending= False )
e d b Utah 2 1 0 Ohio 5 4 3 Texas 8 7 6 Oregon 11 10 9
frame= pd. DataFrame( { 'b' : [ 5 , 7 , - 3 , 2 ] , 'a' : [ 0 , 1 , 0 , 1 ] , 'c' : [ - 2 , 5 , 8 , - 3 ] } )
frame
frame. rank( axis= 0 )
b a c 0 3.0 1.5 2.0 1 4.0 3.5 3.0 2 1.0 1.5 4.0 3 2.0 3.5 1.0
frame. rank( axis= 1 )
b a c 0 3.0 2.0 1.0 1 3.0 1.0 2.0 2 1.0 2.0 3.0 3 3.0 2.0 1.0
rank 的 method 选项用于打破平级关系(如果两个值相等,则它们属于同一个分组,由 method 决定如何分配排名)
method 说明 average 默认:在同组中,为各个值分配平均排名 min 使用整个分组的最小排名 max 使用整个分组的最大排名 first 按值出现的先后顺序
4.6 带着重复值的轴索引
df = DataFrame( np. random. randn( 4 , 3 ) , index= [ 'a' , 'a' , 'b' , 'b' ] )
df
0 1 2 a -0.328041 1.907503 -0.896179 a -0.049003 -0.597808 0.744535 b -1.142489 -0.846847 -0.140037 b -1.154353 -0.947474 -1.460308
df. index. is_unique
False
df. loc[ 'a' ]
0 1 2 a -0.328041 1.907503 -0.896179 a -0.049003 -0.597808 0.744535
df. iloc[ 1 ]
0 -0.049003
1 -0.597808
2 0.744535
Name: a, dtype: float64
4.7汇总和计算描述统计
df = DataFrame( [ [ 1.4 , np. nan] , [ 7.1 , - 4.5 ] , [ np. nan, np. nan] , [ 0.75 , - 1.3 ] ] , index= [ 'a' , 'b' , 'c' , 'd' ] , columns= [ 'one' , 'two' ] )
df
one two
a 1.40 NaN
b 7.10 -4.5
c NaN NaN
d 0.75 -1.3
df. sum ( )
one 9.25
two -5.80
dtype: float64
df. sum ( skipna= False )
one NaN
two NaN
dtype: float64
参数 说明 axis 轴 skipna 是否忽略缺失值,默认为True
描述和汇总统计
函数 说明 count 非NA值的数量 describe 列的计算汇总统计 min,max 最小值,最大值 argmin,argmax 计算能获取到最小值和最大值的索引位置(整数) idxmin,idxmax 计算能获取到最小值和最大值的索引值 quantile 计算样本的分位数(0到1) sum 和 mean 平均数 median 值的算术中位数 mad 根据平均值计算平均绝对离差 var 方差 std 标准差 skew 偏度(三阶矩) kurt 峰度(四阶矩) cumsum 累计和 cummin,cummax 累计最小值和累计最大值 cumprod 累计积 diff 计算一阶差分(对时间序列有用) pct_change 计算百分数变化
obj = Series( [ 'c' , 'a' , 'd' , 'a' , 'b' , 'b' , 'c' , 'c' ] )
obj
0 c
1 a
2 d
3 a
4 b
5 b
6 c
7 c
dtype: object
obj. unique( )
array(['c', 'a', 'd', 'b'], dtype=object)
obj. value_counts( )
c 3
a 2
b 2
d 1
dtype: int64
obj. isin( [ 'b' , 'c' ] )
0 True
1 False
2 False
3 False
4 True
5 True
6 True
7 True
dtype: bool
5.缺失值处理
NA的处理方法
方法 说明 dropna 根据各标签的值中是否存在缺失数据对轴标签进行过滤,可通过阈值调节对缺失值的容忍度 fillna 用指定值或插值方法(ffill或bfill)填充缺失数据 isnull 返回一个含有布尔值的对象,这些布尔值表示哪些值是缺失值NA notnull isnull的否定式
5.1 滤除缺失数据
from numpy import nan as NA
data = DataFrame( [ [ 1 , 6.5 , 3 ] , [ 1 , NA, NA] , [ NA, NA, NA] , [ NA, 6.5 , 3 ] ] )
data
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
2 NaN NaN NaN
3 NaN 6.5 3.0
data. dropna( )
data. dropna( how= 'all' )
0 1 2
0 1.0 6.5 3.0
1 1.0 NaN NaN
3 NaN 6.5 3.0
5.2 填充数据
df = DataFrame( np. random. randn( 7 , 3 ) )
df. iloc[ : 5 , 1 ] = NA; df. iloc[ : 3 , 2 ] = NA
df
0 1 2
0 1.734150 NaN NaN
1 0.706558 NaN NaN
2 -0.393785 NaN NaN
3 1.317984 NaN -2.853658
4 0.415997 NaN -0.125980
5 -0.154539 0.557347 1.042196
6 0.385543 -0.960659 -0.946296
df. fillna( 0 )
0 1 2
0 0.168608 0.000000 0.000000
1 0.012919 0.000000 0.000000
2 1.221231 0.000000 0.000000
3 -1.515374 0.000000 0.774739
4 0.081386 0.000000 -0.327303
5 -0.193886 1.570973 1.057608
6 0.131576 0.623344 0.392884
df. fillna( { 1 : 0.5 , 2 : - 1 } )
0 1 2 0 0.210955 0.500000 -1.000000 1 0.666389 0.500000 -1.000000 2 0.488486 0.500000 -1.000000 3 2.166515 0.500000 0.736014 4 -0.702573 0.500000 -1.150182 5 0.310545 0.024061 -0.506138 6 0.729156 0.368345 1.721337
_ = df. fillna( { 1 : 0.5 , 2 : - 1 } , inplace= True )
df
0 1 2 0 1.734150 0.500000 -1.000000 1 0.706558 0.500000 -1.000000 2 -0.393785 0.500000 -1.000000 3 1.317984 0.500000 -2.853658 4 0.415997 0.500000 -0.125980 5 -0.154539 0.557347 1.042196 6 0.385543 -0.960659 -0.946296
6.层次化索引
data = Series( np. random. randn( 10 ) , index= [ [ 'a' , 'a' , 'a' , 'b' , 'b' , 'b' , 'c' , 'c' , 'd' , 'd' ] , [ 1 , 2 , 3 , 1 , 2 , 3 , 1 , 2 , 2 , 3 ] ] )
data
a 1 -1.076458
2 1.126883
3 0.628407
b 1 1.704866
2 2.212012
3 -2.006192
c 1 0.153430
2 -1.172675
d 2 -2.626143
3 -2.760771
dtype: float64
data[ 'b' ]
1 1.704866
2 2.212012
3 -2.006192
dtype: float64
data[ 'b' , 2 ]
2.212012376812743
data. unstack( )
1 2 3 a -1.076458 1.126883 0.628407 b 1.704866 2.212012 -2.006192 c 0.153430 -1.172675 NaN d NaN -2.626143 -2.760771
frame = DataFrame( np. arange( 12 ) . reshape( 4 , 3 ) ,
index= [ [ 'a' , 'a' , 'b' , 'b' ] , [ 1 , 2 , 1 , 2 ] ] ,
columns= [ [ 'Ohio' , 'Ohio' , 'Colorado' ] , [ 'Green' , 'Red' , 'Green' ] ] )
frame
Ohio Colorado Green Red Green a 1 0 1 2 2 3 4 5 b 1 6 7 8 2 9 10 11
frame = DataFrame( { 'a' : range ( 7 ) , 'b' : range ( 7 , 0 , - 1 ) ,
'c' : [ 'one' , 'one' , 'one' , 'two' , 'two' , 'two' , 'two' ] ,
'd' : [ 0 , 1 , 2 , 0 , 1 , 2 , 3 ] } )
frame
a b c d 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 3 3 4 two 0 4 4 3 two 1 5 5 2 two 2 6 6 1 two 3
frame2 = frame. set_index( [ 'c' , 'd' ] )
frame2
a b c d one 0 0 7 1 1 6 2 2 5 two 0 3 4 1 4 3 2 5 2 3 6 1
frame2 = frame. set_index( [ 'c' , 'd' ] , drop= False )
frame2
a b c d c d one 0 0 7 one 0 1 1 6 one 1 2 2 5 one 2 two 0 3 4 two 0 1 4 3 two 1 2 5 2 two 2 3 6 1 two 3