python数据分析之-pandas基础

本文主要介绍了Python中用于数据分析的重要库——pandas的基础知识。通过学习,读者将掌握如何创建、操作和管理DataFrame对象,包括数据清洗、数据合并及简单的统计分析。pandas的高效特性和易用性使得它成为数据科学家的首选工具。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 
#pandas基础

import pandas as pd
from pandas import Series, DataFrame

import numpy as np
#Series数组的创建与索引

obj = pd.Series([4, 7, -5, 3])    #不设置索引时,默认数字,且从0开始
obj
obj.index
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'c', 'a'])
obj2
obj2.index

obj2[['b', 'a', 'c']]    #利用index来索引

obj.index = ['BOB', 'STEVE', 'JEFF', 'RYAN']    #更改Series的索引
obj
0    4
1    7
2   -5
3    3
dtype: int64

RangeIndex(start=0, stop=4, step=1)

d    4
b    7
c   -5
a    3
dtype: int64

Index(['d', 'b', 'c', 'a'], dtype='object')

b    7
a    3
c   -5
dtype: int64

BOB      4
STEVE    7
JEFF    -5
RYAN     3
dtype: int64
#用bool值过滤、与标量相乘、应用数学函数等都保留索引值连接

obj2[obj2 > 0]
obj2 * 2
np.exp(obj2)
d    4
b    7
a    3
dtype: int64

d     8
b    14
c   -10
a     6
dtype: int64

d      54.598150
b    1096.633158
c       0.006738
a      20.085537
dtype: float64
#Series数组与字典(可认为Series是一个长度固定且有序的字典)

'b' in obj2
'e' in obj2

#将字典转为Series

sdata = {'A':3500, 'B':4500, 'C':2300, 'D':1300}
obj3 = pd.Series(sdata)
obj3

states = ['C', 'D', 'E', 'A']
obj4 = pd.Series(sdata, index=states)       #可自定索引将字典生成Series数组,不包含的索引对应值为NaN
obj4
True

False

A    3500
B    4500
C    2300
D    1300
dtype: int64

C    2300.0
D    1300.0
E       NaN
A    3500.0
dtype: float64
# isnull() notnull()检查缺失数据

pd.isnull(obj4)    #为空返回True
pd.notnull(obj4)   #不为空返回True
obj4.isnull()

pd.isnull(obj4).sum()  #空值个数

C    False
D    False
E     True
A    False
dtype: bool

C     True
D     True
E    False
A     True
dtype: bool

C    False
D    False
E     True
A    False
dtype: bool

1
#自动对齐索引属性

obj3
obj4
obj3 + obj4  #相同索引对应值相加,不同索引值无法处理返回NaN
A    3500
B    4500
C    2300
D    1300
dtype: int64

C    2300.0
D    1300.0
E       NaN
A    3500.0
dtype: float64


A    7000.0
B       NaN
C    4600.0
D    2600.0
E       NaN
dtype: float64
#对Series数组命名,对其索引命名(name属性)

obj4.name = 'population'
obj4.index.name = 'state'    #值不能命名
obj4
state
C    2300.0
D    1300.0
E       NaN
A    3500.0
Name: population, dtype: float64
# DataFrame的创建

data = {'state':['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
       'year':[2000, 2001, 2002, 2001, 2002, 2003],
       'pop':[1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}                      #利用包含等长度列表或numpy数组的字典来创建DataFrame
frame = pd.DataFrame(data)
frame

frame.head()  #对于大型表格,用head()只选取前五行
stateyearpop
0Ohio20001.5
1Ohio20011.7
2Ohio20023.6
3Nevada20012.4
4Nevada20022.9
5Nevada20033.2
stateyearpop
0Ohio20001.5
1Ohio20011.7
2Ohio20023.6
3Nevada20012.4
4Nevada20022.9
# DataFrame的索引

frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], index = ['one', 'two', 'three', 'four', 'five', 'six'])
frame2       #可指定列的顺序,更改索引名,若所指定的列不存在,返回NaN值

frame2.year
frame2['year']    #选取列,这两种方式等价

frame2.loc['three']   #选取行
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7NaN
three2002Ohio3.6NaN
four2001Nevada2.4NaN
five2002Nevada2.9NaN
six2003Nevada3.2NaN
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64


one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64


year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
#更改列值

frame2['debt'] = 16.5   
frame2

frame2['debt'] = np.arange(6)
frame2

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val           #将Series赋值给一列时,其索引将会按照DataFrame的索引重新排列,并在空缺地方填充缺失值
frame2
yearstatepopdebt
one2000Ohio1.516.5
two2001Ohio1.716.5
three2002Ohio3.616.5
four2001Nevada2.416.5
five2002Nevada2.916.5
six2003Nevada3.216.5
yearstatepopdebt
one2000Ohio1.50
two2001Ohio1.71
three2002Ohio3.62
four2001Nevada2.43
five2002Nevada2.94
six2003Nevada3.25
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
six2003Nevada3.2NaN
#删除列

frame2['eastern'] = (frame2.state == 'Ohio')   #创建新列,且值为bool值
frame2

del frame2['eastern']
frame2
yearstatepopdebteastern
one2000Ohio1.5NaNTrue
two2001Ohio1.7-1.2True
three2002Ohio3.6NaNTrue
four2001Nevada2.4-1.5False
five2002Nevada2.9-1.7False
six2003Nevada3.2NaNFalse
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
six2003Nevada3.2NaN
#DataFrame的创建2

pop = {'Nevada': {2001:2.4, 2002:2.9},
       'Ohio':{2000:1.5, 2001:1.7, 2002:3.6}}         #包含字典的嵌套字典,嵌套字典的索引作为DataFrame行索引
frame3 = pd.DataFrame(pop)
frame3

frame3.T     #转置,调换行列

pd.DataFrame(pop, index=[2001, 2002, 2003])      #可指定行索引

pdata = {'Ohio':frame3['Ohio'][:-1],
        'Nevada':frame3['Nevada'][:2]}         #包含Series的字典也可以用于构造DataFrame
pd.DataFrame(pdata)
NevadaOhio
2000NaN1.5
20012.41.7
20022.93.6
200020012002
NevadaNaN2.42.9
Ohio1.51.73.6
NevadaOhio
20012.41.7
20022.93.6
2003NaNNaN
OhioNevada
20001.5NaN
20011.72.4
#对DataFrame的索引和列命名(name属性)

frame3.index.name = 'year'
frame3.columns.name = 'state'
frame3

stateNevadaOhio
year
2000NaN1.5
20012.41.7
20022.93.6
#索引对象

obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
index[1:]
#index[1] = 'd' 报错。不能修改。索引对象是不可变的。

labels = pd.Index(np.arange(3))    #创建索引
labels

obj2 = pd.Series([-1, -2, -4], index=labels)
obj2
obj2.index is labels

Index(['a', 'b', 'c'], dtype='object')






Index(['b', 'c'], dtype='object')






Int64Index([0, 1, 2], dtype='int64')






0   -1
1   -2
2   -4
dtype: int64






True
# reindex 对索引重排

obj = pd.Series([4.5, 1.2, -5.5, 2.6], index=['d', 'b', 'a', 'c'])
obj
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
obj   #未改变obj

#可选参数:
#index
#method='ffill' 前向填充;method='bfill'后向填充
#fill_value 通过重新索引引入缺失数据时使用的替代值
#limit 前向或后向填充时,所需填充的最大尺寸间隙
#tolerance 前向或后向填充时,所需填充的不精确匹配下的最大尺寸间隙
#level
#copy


obj3 = pd.Series(['blue', 'yellew', 'red'], index = [0, 2, 4])
obj3
obj31 = obj3.reindex(range(6), method = 'ffill')
print('obj31=\n{}'.format(obj31))
obj32 = obj3.reindex(range(6), method = 'bfill')
print('obj32=\n{}'.format(obj32))

d    4.5
b    1.2
a   -5.5
c    2.6
dtype: float64






a   -5.5
b    1.2
c    2.6
d    4.5
e    NaN
dtype: float64






d    4.5
b    1.2
a   -5.5
c    2.6
dtype: float64






0      blue
2    yellew
4       red
dtype: object



obj31=
0      blue
1      blue
2    yellew
3    yellew
4       red
5       red
dtype: object
obj32=
0      blue
1    yellew
2    yellew
3       red
4       red
5       NaN
dtype: object
#在DataFrame中,reindex可以改变行索引、列索引,也可以同时改变二者。当仅传入一个序列时,结果中的行会重建索引。

# Build a 3x3 frame whose row labels skip 'b', so reindexing has a gap to fill.
frame = pd.DataFrame(np.arange(9).reshape(3,3), index = ['a', 'c', 'd'], columns = ['Ohio', 'Texas', 'California'])
frame

# A single sequence passed to reindex rebuilds the row index; the new row 'b' is filled with NaN.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print('frame2=\n{}'.format(frame2))

# The columns keyword rebuilds the column index instead; 'Utah' is new, so it is all NaN.
states = ['Texas', 'Utah', 'California']
print(frame.reindex(columns=states))

# Both axes can be rebuilt in one call. Label selection with .loc is not a
# substitute here: .loc raises KeyError as soon as any requested label
# ('b' or 'Utah') is missing, whereas reindex inserts NaN for it.
print(frame.reindex(index=['a', 'b', 'c', 'd'], columns=states))
OhioTexasCalifornia
a012
c345
d678
frame2=
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8
   Texas  Utah  California
a    1.0   NaN         2.0
b    NaN   NaN         NaN
c    4.0   NaN         5.0
d    7.0   NaN         8.0
# drop 从轴向上删除

obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj.drop(['a', 'c'])
obj    #drop不更改原对象
obj.drop('a', inplace=True)  #利用参数inplace,可清除要删除的数据
obj

data = pd.DataFrame(np.arange(16).reshape(4,4), index=['ohio', 'colo', 'utah', 'new'], columns=['one', 'tow', 'three', 'four'])
data
data.drop(['ohio', 'utah'])  #默认删除行
data.drop(['one', 'three'], axis='columns')  #删除列,也可为axis=1
data.drop(['one'], axis=1)
b    1.0
d    3.0
e    4.0
dtype: float64






a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64






b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
onetowthreefour
ohio0123
colo4567
utah891011
new12131415
onetowthreefour
colo4567
new12131415
towfour
ohio13
colo57
utah911
new1315
towthreefour
ohio123
colo567
utah91011
new131415
#Series的索引与切片

# A float Series with string labels, used to contrast label-based and
# position-based selection.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

obj['b']        # select by label
obj.iloc[1]     # select by position; plain obj[1] on a non-integer index is deprecated

obj[['a', 'c', 'd']]     # several labels at once
obj.iloc[[0, 2, 3]]      # several positions at once

obj.iloc[1:3]   # positional slice: the end position is excluded
obj['b':'d']    # label slice: the end label is included

obj[obj<2]      # boolean mask keeps the entries where the condition holds
1.0






1.0






a    0.0
c    2.0
d    3.0
dtype: float64






a    0.0
c    2.0
d    3.0
dtype: float64






b    1.0
c    2.0
dtype: float64






b    1.0
c    2.0
d    3.0
dtype: float64






a    0.0
b    1.0
dtype: float64
#DataFrame的索引与切片

data = pd.DataFrame(np.arange(16).reshape(4,4),
                   index=['ohio', 'colo', 'utah', 'new'],
                   columns=['one', 'two', 'three', 'four'])
data

data[['two', 'three']]    #选择列
data[:2]     #选择行

data['three']>5
data[data['three']>5]

data[data < 5] = 0
data
onetwothreefour
ohio0123
colo4567
utah891011
new12131415
twothree
ohio12
colo56
utah910
new1314
onetwothreefour
ohio0123
colo4567
ohio    False
colo     True
utah     True
new      True
Name: three, dtype: bool
onetwothreefour
colo4567
utah891011
new12131415
onetwothreefour
ohio0000
colo0567
utah891011
new12131415
#轴标签loc,整数标签iloc

data

data.loc['colo', ['two', 'three']]
data.iloc[1, [1, 2]]

data.iloc[1:3, 2:4]
data.loc['colo':'utah', 'three':'four']  #保留尾部

data.iloc[:, :3][data.three>5]

data[data.three>7]


#注意loc和iloc区别

ser = pd.Series(np.arange(3.))
'\nser=\n{}'.format(ser)
print('\nser[:1]=\n{}'.format(ser[:1]))
print('\nser.loc[:1]=\n{}'.format(ser.loc[:1]))
print('\nser.iloc[:1]=\n{}'.format(ser.iloc[:1]))
onetwothreefour
ohio0000
colo0567
utah891011
new12131415
two      5
three    6
Name: colo, dtype: int32






two      5
three    6
Name: colo, dtype: int32
threefour
colo67
utah1011
threefour
colo67
utah1011
onetwothree
colo056
utah8910
new121314
onetwothreefour
utah891011
new12131415
'\nser=\n0    0.0\n1    1.0\n2    2.0\ndtype: float64'




ser[:1]=
0    0.0
dtype: float64

ser.loc[:1]=
0    0.0
1    1.0
dtype: float64

ser.iloc[:1]=
0    0.0
dtype: float64
#算术与数据对齐
#对象相加时,若两个对象的索引标签不完全相同,则结果的索引是两者索引的并集,未同时出现在两个对象中的标签对应的值为NaN。类似于数据库的外连接(outer join)

a = pd.Series(np.arange(2), index=['a', 'b'])
b = pd.Series(np.arange(2)+1, index=['b', 'c'])
print('a=\n{}'.format(a))
print('b=\n{}'.format(b))
print('a+b=\n{}'.format(a+b))

C = pd.DataFrame(np.arange(9).reshape(3,3), index=['a', 'b', 'c'], columns=['one', 'two', 'three'])
D = pd.DataFrame(np.ones((3,3)), index=['a', 'b', 'd'], columns=['two', 'three', 'four'])
print('C=\n{}'.format(C))
print('D=\n{}'.format(D))
print('C+D=\n{}'.format(C+D))

C.add(D, fill_value=0)
D.add(C, fill_value=0)   #两者相同,返回的索引结果为两者并集,表格内数据对应项相加,非对应项取有值的,两者都为空取空
a=
a    0
b    1
dtype: int32
b=
b    1
c    2
dtype: int32
a+b=
a    NaN
b    2.0
c    NaN
dtype: float64
C=
   one  two  three
a    0    1      2
b    3    4      5
c    6    7      8
D=
   two  three  four
a  1.0    1.0   1.0
b  1.0    1.0   1.0
d  1.0    1.0   1.0
C+D=
   four  one  three  two
a   NaN  NaN    3.0  2.0
b   NaN  NaN    6.0  5.0
c   NaN  NaN    NaN  NaN
d   NaN  NaN    NaN  NaN
fouronethreetwo
a1.00.03.02.0
b1.03.06.05.0
cNaN6.08.07.0
d1.0NaN1.01.0
fouronethreetwo
a1.00.03.02.0
b1.03.06.05.0
cNaN6.08.07.0
d1.0NaN1.01.0
#算术

df = pd.DataFrame(np.arange(6).reshape(2,3), columns=['a', 'b', 'c'])
df.div(1)   #df/1
df.rdiv(1)  #1/df

#加法 add, radd
#减法 sub, rsub
#除法 div, rdiv
#整除 floordiv, rfloordiv
#乘法 mul, rmul
#幂次方 pow, rpow
abc
00.01.02.0
13.04.05.0
abc
0inf1.000.5
10.3333330.250.2
#DataFrame与Series间的操作
#默认情况下,两者的数学操作中会将Series的索引和DataFrame的列进行匹配,并广播到各行

frame = pd.DataFrame(np.arange(12.).reshape(4,3), columns=list('abc'), index=['utah', 'ohio', 'texas', 'oregon'])
series = frame.iloc[0]  #frame的第一行

frame
series
frame-series

#若某个标签不在DataFrame的列中,或者不在Series的索引中,则结果会按两者的并集重建索引,缺失的位置填充NaN

series2 = pd.Series(range(3), index=['b', 'c', 'd'])
frame + series2

#若在列上广播,在行上匹配,需要使用算数方法
frame
series3 = frame['b']  #frame的'b'列
frame.add(series3, axis='index')  #将series加到frame的每一列上
frame.sub(series3, axis='index')  #对frame每一列减去series3
abc
utah0.01.02.0
ohio3.04.05.0
texas6.07.08.0
oregon9.010.011.0
a    0.0
b    1.0
c    2.0
Name: utah, dtype: float64
abc
utah0.00.00.0
ohio3.03.03.0
texas6.06.06.0
oregon9.09.09.0
abcd
utahNaN1.03.0NaN
ohioNaN4.06.0NaN
texasNaN7.09.0NaN
oregonNaN10.012.0NaN
abc
utah0.01.02.0
ohio3.04.05.0
texas6.07.08.0
oregon9.010.011.0
abc
utah1.02.03.0
ohio7.08.09.0
texas13.014.015.0
oregon19.020.021.0
abc
utah-1.00.01.0
ohio-1.00.01.0
texas-1.00.01.0
oregon-1.00.01.0
# 函数应用和映射

frame = pd.DataFrame(np.random.randn(4,3), index=['utah', 'ohio', 'texas', 'oregon'], columns=list('bde'))
frame
np.abs(frame)

f = lambda x: x.max() - x.min()     #自定义函数
frame.apply(f)  #默认将每一列作为x,并返回f,计算每列的最大最小差值
frame.apply(f, axis='columns')    #将每一行作为x,并返回f,计算每行的最大最小差值

def f(x):
    """Return the minimum, maximum and range of *x* as a labelled Series.

    *x* is whatever ``DataFrame.apply`` hands over (one column or one row);
    it only needs to provide ``min()`` and ``max()``. Because the result is a
    Series rather than a scalar, ``apply`` stacks the results into a table
    with the rows 'min', 'max' and 'max-min'.
    """
    lowest, highest = x.min(), x.max()
    summary = [lowest, highest, highest - lowest]
    return pd.Series(summary, index=['min', 'max', 'max-min'])
frame.apply(f)         #frame的每一列作为x代入f(x),并返回一个给定标签的Series,故最终返回表

# Element-wise mapping: apply a scalar function to every cell of the frame.
# NOTE: DataFrame.applymap is called DataFrame.map from pandas 2.1 onwards;
# switch to frame.map(...) when running on a recent pandas.
frame
fadd = lambda x: x+1
frame.applymap(fadd)

# Render every float in the frame as a string with two decimals.
# The helper is deliberately not named `format`, so the builtin format()
# stays usable for the rest of the session.
two_decimals = lambda x: '%.2f' % x
frame.applymap(two_decimals)

frame['e'].map(two_decimals)   # on a Series, map() does the element-wise work

bde
utah-0.192940-0.2199792.046093
ohio-0.1928680.155119-0.077429
texas1.9666070.804258-0.668518
oregon-0.8537400.549948-0.835668
bde
utah0.1929400.2199792.046093
ohio0.1928680.1551190.077429
texas1.9666070.8042580.668518
oregon0.8537400.5499480.835668
b    2.820348
d    1.024237
e    2.881761
dtype: float64






utah      2.266071
ohio      0.347988
texas     2.635125
oregon    1.403688
dtype: float64
bde
min-0.853740-0.219979-0.835668
max1.9666070.8042582.046093
max-min2.8203481.0242372.881761
bde
utah-0.192940-0.2199792.046093
ohio-0.1928680.155119-0.077429
texas1.9666070.804258-0.668518
oregon-0.8537400.549948-0.835668
bde
utah0.8070600.7800213.046093
ohio0.8071321.1551190.922571
texas2.9666071.8042580.331482
oregon0.1462601.5499480.164332
bde
utah-0.19-0.222.05
ohio-0.190.16-0.08
texas1.970.80-0.67
oregon-0.850.55-0.84
utah       2.05
ohio      -0.08
texas     -0.67
oregon    -0.84
Name: e, dtype: object
#sort_index(), sort_values() 排列

# Sorting by labels (sort_index) and by values (sort_values).
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj
obj.sort_index()   # returns the Series ordered by its index labels

frame = pd.DataFrame(np.arange(8).reshape((2,4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame
frame.sort_index()   # sorts by the row index by default
frame.sort_index(axis=1)   # axis=1 sorts by the column labels instead
frame.sort_index(axis=1, ascending=False)   # ascending=False gives descending order

obj = pd.Series([4, np.nan, -7, np.nan, -3, 2])
obj.sort_values()    # sorts by value; missing values are placed last by default

frame = pd.DataFrame({'b':[4, 7, -3, 2], 'a':[1 ,0, 1, 0]})
frame
frame.sort_values(by=['a', 'b']) # sort by one or more columns: first by 'a', ties broken by 'b'
d    0
a    1
b    2
c    3
dtype: int64






a    1
b    2
c    3
d    0
dtype: int64
dabc
three0123
one4567
dabc
one4567
three0123
abcd
three1230
one5674
dcba
three0321
one4765
2   -7.0
4   -3.0
5    2.0
0    4.0
1    NaN
3    NaN
dtype: float64
ba
041
170
2-31
320
ba
320
170
2-31
041
#排名 rank()

obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

obj.rank()   #默认按升序排名(即值最小排第一),且每个组中分配平均排名(一个组中成绩相同)
obj.rank(method='min')  #对每个组使用最小排名
obj.rank(method='max')  #对每个组使用最大排名
obj.rank(method='first')  #按照值在数据中出现的次序分配排名
obj.rank(method='dense')  #类似method='min',但组间排名总是加1(组间排名连贯)

obj.rank(ascending=False, method='min')  #按降序排名(即最大值排第一)

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1], 'c':[-2, 5, 8, -2.5]})
frame
frame.rank(axis=1)  #对每行的值进行排名

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64






0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64






0    6.0
1    1.0
2    6.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64






0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64






0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64






0    5.0
1    1.0
2    5.0
3    4.0
4    3.0
5    2.0
6    4.0
dtype: float64






0    1.0
1    7.0
2    1.0
3    3.0
4    5.0
5    6.0
6    3.0
dtype: float64
bac
04.30-2.0
17.015.0
2-3.008.0
32.01-2.5
bac
03.02.01.0
13.01.02.0
21.02.03.0
33.02.01.0
#含有重复索引的轴标签

obj = pd.Series(np.arange(4), index=list('aabb'))
obj

obj.index.is_unique  #判断索引是否唯一

obj['a']

frame = pd.DataFrame(np.arange(12).reshape(4,3), index=list('aabb'), columns=['bob', 'mom', 'bob'])
frame
frame['bob']
frame.loc['a']
a    0
a    1
b    2
b    3
dtype: int32






False






a    0
a    1
dtype: int32
bobmombob
a012
a345
b678
b91011
bobbob
a02
a35
b68
b911
bobmombob
a012
a345
#描述性统计的概述与计算

df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df

df.sum()  #列上求和
df.sum(axis='columns')  #行上求和

df.mean(axis='columns')
df.mean(axis='columns', skipna=False)  #除非全为nan,否则会忽略NA来计算。利用skipna=False来不排除NA值

df.idxmin()  #返回列上最小值的索引值
df.idxmax(axis=1)  #返回行上最大值的索引值

df.cumsum()  #逐行累积求和,即第k行的值=原表k+(k-1)+...+1行的值

df.describe()  #返回统计汇总
#count:非NA值的个数
#std 样本标准差
#25%、50%、75% 分位数

obj = pd.Series(['a', 'a', 'b', 'c']*4)
obj
obj.describe()   #对于非数值型数据,产生不同统计汇总结果

onetwo
a1.40NaN
b7.10-4.5
cNaNNaN
d0.75-1.3
one    9.25
two   -5.80
dtype: float64






a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64






a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64






a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64






one    d
two    b
dtype: object






a    one
b    one
c    NaN
d    one
dtype: object
onetwo
a1.40NaN
b8.50-4.5
cNaNNaN
d9.25-5.8
onetwo
count3.0000002.000000
mean3.083333-2.900000
std3.4936852.262742
min0.750000-4.500000
25%1.075000-3.700000
50%1.400000-2.900000
75%4.250000-2.100000
max7.100000-1.300000
0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object






count     16
unique     3
top        a
freq       8
dtype: object
# 更新pip:管理员运行cmd,输入python -m pip install -U --force-reinstall pip 回车
# 安装pandas-datareader库
# 进入anaconda目录,输入conda install -c anaconda pandas-datareader 回车


import pandas_datareader.data as web
#利用pandas_datareader模块下载一些股票行情的数据

# Fetch the quote table returned by get_data_yahoo for each ticker (network access required).
all_data = {ticker:web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
# One column per ticker, taken from each table's 'Adj Close' column.
price = pd.DataFrame({ticker: data['Adj Close'] for ticker, data in all_data.items()})
# Same layout for the 'Volume' column. The closing brace must come after the
# comprehension: with `{ticker: data['Volume']} for ...` pandas receives a
# generator of one-item dicts and builds a malformed frame, which is why a
# later returns.corrwith(volume) comes out as all NaN.
volume = pd.DataFrame({ticker: data['Volume'] for ticker, data in all_data.items()})

returns = price.pct_change()  # percent change of each price relative to the previous row
returns.tail()   # like head(), but shows the last five rows by default
AAPLIBMMSFTGOOG
Date
2019-11-21-0.0044830.004805-0.000936-0.001305
2019-11-22-0.0008780.0037360.000736-0.004618
2019-11-250.0175340.0121330.0109630.008762
2019-11-26-0.007809-0.0064720.0052900.005250
2019-11-270.006962-0.0017770.001085-0.000693
# A.corr(B) 计算SeriesA和B中重叠的、非NA的、按索引对齐的值的相关性
# A.cov(B) 计算协方差

returns['MSFT'].corr(returns['IBM'])
returns.MSFT.corr(returns.IBM)      #两种表达方式相同,第二种更简洁

returns['MSFT'].cov(returns['IBM'])
returns.MSFT.cov(returns.IBM)

returns.corr()  #相关系数矩阵
returns.cov()   #协方差矩阵

0.48735169014084734






0.48735169014084734






9.401947504905966e-05






9.401947504905966e-05
AAPLIBMMSFTGOOG
AAPL1.0000000.4068730.5721870.524064
IBM0.4068731.0000000.4873520.414445
MSFT0.5721870.4873521.0000000.658972
GOOG0.5240640.4144450.6589721.000000
AAPLIBMMSFTGOOG
AAPL0.0002470.0000830.0001330.000125
IBM0.0000830.0001700.0000940.000082
MSFT0.0001330.0000940.0002180.000148
GOOG0.0001250.0000820.0001480.000231
# A.corrwith(B) 可计算出DataFrame A中的行或列与另一个序列或DataFrame B的相关性

returns.corrwith(returns.IBM)   # given a Series: returns a Series holding each column's correlation with it

returns.corrwith(volume)   # given a DataFrame: columns are paired by name, so this correlates each ticker's percent price change with the same-named column of `volume`
AAPL    0.406873
IBM     1.000000
MSFT    0.487352
GOOG    0.414445
dtype: float64






AAPL   NaN
GOOG   NaN
IBM    NaN
MSFT   NaN
dtype: float64
# 数据去重

obj = pd.Series(['c', 'a', 'd', 'a', 'b', 'b', 'c', 'c'])

uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
# 分组计数

obj.value_counts()   # occurrences of each distinct value, most frequent first

# The top-level pd.value_counts helper is deprecated; wrap the raw array in a
# Series and call the method instead. sort=False skips ordering by frequency.
pd.Series(obj.values).value_counts(sort=False)
c    3
b    2
a    2
d    1
dtype: int64






a    2
c    3
b    2
d    1
dtype: int64
# isin 执行向量化的成员属性检查

obj
mask = obj.isin(['b','c'])
mask
obj[mask]
0    c
1    a
2    d
3    a
4    b
5    b
6    c
7    c
dtype: object






0     True
1    False
2    False
3    False
4     True
5     True
6     True
7     True
dtype: bool






0    c
4    b
5    b
6    c
7    c
dtype: object
# pd.Index(A).get_indexer(B) 返回一个索引数组,该索引数组为B每个元素在A中的索引值

to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_values = pd.Series(['c', 'b', 'a'])

pd.Index(unique_values).get_indexer(to_match)
array([0, 2, 1, 1, 0, 2], dtype=int64)
# 计算DataFrame多个相关列的直方图

# Three columns that draw their answers from the same small set of values.
data = pd.DataFrame({'Qu1':[1, 3, 4, 3, 4], 'Qu2':[2, 3, 1, 2, 3], 'Qu3':[1, 5, 2, 4, 4]})
data

# Count the values of every column separately. apply() aligns the per-column
# counts on the union of distinct values, and fillna(0) turns "value never
# appears in this column" into a count of 0.
# pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
result = data.apply(pd.Series.value_counts).fillna(0)
result  # row labels are the distinct values; each cell is how often that value occurs in the column
Qu1Qu2Qu3
0121
1335
2412
3324
4434
Qu1Qu2Qu3
11.01.01.0
20.02.01.0
32.02.00.0
42.00.02.0
50.00.01.0

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值