0 4
1 7
2 -5
3 3
dtype: int64
RangeIndex(start=0, stop=4, step=1)
d 4
b 7
c -5
a 3
dtype: int64
Index(['d', 'b', 'c', 'a'], dtype='object')
b 7
a 3
c -5
dtype: int64
BOB 4
STEVE 7
JEFF -5
RYAN 3
dtype: int64
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
# Overwrite the values of an existing column.
frame2['debt'] = 16.5  # a scalar is written to every row
frame2
frame2['debt'] = np.arange(6)  # an array is assigned element by element
frame2
# Assigning a Series to a column realigns it to the DataFrame's index;
# rows whose label is absent from the Series are filled with missing values.
val = pd.Series({'two': -1.2, 'four': -1.5, 'five': -1.7})
frame2['debt'] = val
frame2
year
state
pop
debt
one
2000
Ohio
1.5
16.5
two
2001
Ohio
1.7
16.5
three
2002
Ohio
3.6
16.5
four
2001
Nevada
2.4
16.5
five
2002
Nevada
2.9
16.5
six
2003
Nevada
3.2
16.5
year
state
pop
debt
one
2000
Ohio
1.5
0
two
2001
Ohio
1.7
1
three
2002
Ohio
3.6
2
four
2001
Nevada
2.4
3
five
2002
Nevada
2.9
4
six
2003
Nevada
3.2
5
year
state
pop
debt
one
2000
Ohio
1.5
NaN
two
2001
Ohio
1.7
-1.2
three
2002
Ohio
3.6
NaN
four
2001
Nevada
2.4
-1.5
five
2002
Nevada
2.9
-1.7
six
2003
Nevada
3.2
NaN
# Delete a column.
# Assigning to a column name that does not exist yet creates the column;
# here it holds one boolean per row.
frame2['eastern'] = frame2['state'] == 'Ohio'
frame2
del frame2['eastern']
frame2
d 4.5
b 1.2
a -5.5
c 2.6
dtype: float64
a -5.5
b 1.2
c 2.6
d 4.5
e NaN
dtype: float64
d 4.5
b 1.2
a -5.5
c 2.6
dtype: float64
0 blue
2 yellew
4 red
dtype: object
obj31=
0 blue
1 blue
2 yellew
3 yellew
4 red
5 red
dtype: object
obj32=
0 blue
1 yellew
2 yellew
3 red
4 red
5 NaN
dtype: object
# On a DataFrame, reindex can change the row index, the column index, or
# both. When only a sequence is passed, the rows are reindexed.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
frame
frame2 = frame.reindex(['a', 'b', 'c', 'd'])  # rebuild the row index
print('frame2=\n{}'.format(frame2))
states = ['Texas', 'Utah', 'California']
print(frame.reindex(columns=states))
# Selecting by label with frame.loc[[...], states] raises KeyError when any
# requested label is missing (pandas >= 1.0); reindex on both axes inserts
# NaN rows/columns for the missing labels instead.
print(frame.reindex(index=['a', 'b', 'c', 'd'], columns=states))
Ohio
Texas
California
a
0
1
2
c
3
4
5
d
6
7
8
frame2=
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
Texas Utah California
a 1.0 NaN 2.0
b NaN NaN NaN
c 4.0 NaN 5.0
d 7.0 NaN 8.0
# drop removes entries along an axis.
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj.drop(['a', 'c'])
obj  # drop returns a new object; the original is left unchanged
obj.drop('a', inplace=True)  # with inplace=True the entry is removed from obj itself
obj
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['ohio', 'colo', 'utah', 'new'],
    columns=['one', 'two', 'three', 'four'],  # 'two' was misspelled 'tow'
)
data
data.drop(['ohio', 'utah'])  # rows are dropped by default
data.drop(['one', 'three'], axis='columns')  # drop columns; axis=1 also works
data.drop(['one'], axis=1)
b 1.0
d 3.0
e 4.0
dtype: float64
a 0.0
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
b 1.0
c 2.0
d 3.0
e 4.0
dtype: float64
1.0
1.0
a 0.0
c 2.0
d 3.0
dtype: float64
a 0.0
c 2.0
d 3.0
dtype: float64
b 1.0
c 2.0
dtype: float64
b 1.0
c 2.0
d 3.0
dtype: float64
a 0.0
b 1.0
dtype: float64
# Indexing and slicing a DataFrame.
data = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['ohio', 'colo', 'utah', 'new'],
    columns=['one', 'two', 'three', 'four'],
)
data
data[['two', 'three']]  # a list of labels selects columns
data[:2]  # a slice selects rows
data['three'] > 5  # boolean Series, one entry per row
data[data['three'] > 5]  # a boolean Series keeps only the rows where it is True
data[data < 5] = 0  # a boolean DataFrame picks the cells to overwrite
data
# Arithmetic and data alignment.
# When two objects are added and some index labels are not shared, the
# result's index is the union of both indexes and the non-shared positions
# are NaN. This is similar to an outer join in a database.
a = pd.Series(np.arange(2), index=['a', 'b'])
b = pd.Series(np.arange(2) + 1, index=['b', 'c'])
print('a=\n{}'.format(a))
print('b=\n{}'.format(b))
print('a+b=\n{}'.format(a + b))
C = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'b', 'c'],
    columns=['one', 'two', 'three'],
)
D = pd.DataFrame(
    np.ones((3, 3)),
    index=['a', 'b', 'd'],
    columns=['two', 'three', 'four'],
)
print('C=\n{}'.format(C))
print('D=\n{}'.format(D))
print('C+D=\n{}'.format(C + D))
C.add(D, fill_value=0)
# Same result as the line above: the index is the union of both frames,
# cells present in both are summed, cells present in only one frame keep
# that frame's value, and cells missing from both stay NaN.
D.add(C, fill_value=0)
a=
a 0
b 1
dtype: int32
b=
b 1
c 2
dtype: int32
a+b=
a NaN
b 2.0
c NaN
dtype: float64
C=
one two three
a 0 1 2
b 3 4 5
c 6 7 8
D=
two three four
a 1.0 1.0 1.0
b 1.0 1.0 1.0
d 1.0 1.0 1.0
C+D=
four one three two
a NaN NaN 3.0 2.0
b NaN NaN 6.0 5.0
c NaN NaN NaN NaN
d NaN NaN NaN NaN
one 9.25
two -5.80
dtype: float64
a 1.40
b 2.60
c 0.00
d -0.55
dtype: float64
a 1.400
b 1.300
c NaN
d -0.275
dtype: float64
a NaN
b 1.300
c NaN
d -0.275
dtype: float64
one d
two b
dtype: object
a one
b one
c NaN
d one
dtype: object
one
two
a
1.40
NaN
b
8.50
-4.5
c
NaN
NaN
d
9.25
-5.8
one
two
count
3.000000
2.000000
mean
3.083333
-2.900000
std
3.493685
2.262742
min
0.750000
-4.500000
25%
1.075000
-3.700000
50%
1.400000
-2.900000
75%
4.250000
-2.100000
max
7.100000
-1.300000
0 a
1 a
2 b
3 c
4 a
5 a
6 b
7 c
8 a
9 a
10 b
11 c
12 a
13 a
14 b
15 c
dtype: object
count 16
unique 3
top a
freq 8
dtype: object
# Setup notes:
#   * Upgrade pip: from an administrator cmd prompt run
#       python -m pip install -U --force-reinstall pip
#   * Install the pandas-datareader library: from the Anaconda directory run
#       conda install -c anaconda pandas-datareader
import pandas_datareader.data as web

# Download some stock market data with the pandas_datareader module.
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
# Dict comprehension (one column per ticker), mirroring `price` above.
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})
# Compute a histogram of values across several related DataFrame columns.
data = pd.DataFrame({
    'Qu1': [1, 3, 4, 3, 4],
    'Qu2': [2, 3, 1, 2, 3],
    'Qu3': [1, 5, 2, 4, 4],
})
data
# Use pd.Series.value_counts: the top-level pd.value_counts function is
# deprecated (pandas 2.1). Values that never occur in a column come back
# as NaN after alignment and are replaced by 0.
result = data.apply(pd.Series.value_counts).fillna(0)
result  # row labels are the distinct values found in the columns; each cell is how often that value occurs in that column