pandas

Compared with Python's built-in types: NumPy is like a list, holding values without labels, while pandas is like a dict, holding values with labels. pandas is built on top of NumPy and makes NumPy-centric work much simpler.

pandas provides two main data structures: Series and DataFrame.

import pandas as pd
import numpy as np

#create a Series
s = pd.Series([1,2,3,np.nan,4,5])
print(s)
"""
0    1.0
1    2.0
2    3.0
3    NaN
4    4.0
5    5.0
dtype: float64
"""
#the string representation of a Series shows the index on the left and the values on the right; since we did not specify an index, a default integer index from 0 to N-1 (N = length) is created automatically
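
If we do want our own labels, we can pass an index when creating the Series. A minimal sketch (the labels 'a', 'b', 'c' are made up for illustration):

#create a Series with an explicit index (sketch)
s2 = pd.Series([1,2,3],index=['a','b','c'])
print(s2)
"""
a    1
b    2
c    3
dtype: int64
"""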

#create a DataFrame
datas = pd.date_range('2019',periods=4)
df = pd.DataFrame(np.random.randn(4,4),index=datas,columns=['a','b','c','d'])
print(df)
"""
                   a         b        c         d
2019-01-01  0.691600  0.116700 -1.02010 -1.327178
2019-01-02  1.499131  1.354915 -0.72470 -1.078789
2019-01-03  0.320457 -1.575094  0.96635 -0.366165
2019-01-04  1.306082 -0.757091  0.11261  0.179203
"""
#a DataFrame is a tabular data structure with an ordered set of columns, and each column can hold a different value type (numeric, string, boolean, ...); it has both a row index and a column index, and can be thought of as a big dict of Series
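
Because a DataFrame behaves like a dict of Series, it can also be built directly from one. A small sketch (the column names 'price' and 'qty' are invented for the example):

#build a DataFrame from a dict of Series (sketch)
price = pd.Series([1.0,2.0],index=['x','y'])
qty = pd.Series([10,20],index=['x','y'])
print(pd.DataFrame({'price':price,'qty':qty}))
"""
   price  qty
x    1.0   10
y    2.0   20
"""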

Some simple DataFrame operations

#select the column labelled 'a' from df
print(df['a'])
"""
2019-01-01    0.691600
2019-01-02    1.499131
2019-01-03    0.320457
2019-01-04    1.306082
Freq: D, Name: a, dtype: float64
"""

#create df1 without specifying row or column labels
df1 = pd.DataFrame(np.arange(12).reshape((3,4)))
print(df1)
"""
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
"""
#both axes default to an integer index starting at 0

#specify how each column is built
df2 = pd.DataFrame({'a':1,'b':pd.Timestamp('2019'),'c':np.array([3]*4),'d':['ss','ll','ss','ll']})
print(df2)
"""
   a          b  c   d
0  1 2019-01-01  3  ss
1  1 2019-01-01  3  ll
2  1 2019-01-01  3  ss
3  1 2019-01-01  3  ll
"""

#inspect the dtype of each column
print(df2.dtypes)
"""
a             int64
b    datetime64[ns]
c             int32
d            object
dtype: object
"""

#inspect the column names
print(df2.columns)
"""
Index(['a', 'b', 'c', 'd'], dtype='object')
"""

#inspect all the values of df2 as an array
print(df2.values)
"""
[[1 Timestamp('2019-01-01 00:00:00') 3 'ss']
 [1 Timestamp('2019-01-01 00:00:00') 3 'll']
 [1 Timestamp('2019-01-01 00:00:00') 3 'ss']
 [1 Timestamp('2019-01-01 00:00:00') 3 'll']]
"""

#numeric summary of the data (only the numeric columns are summarized)
print(df2.describe())
"""
         a    c
count  4.0  4.0
mean   1.0  3.0
std    0.0  0.0
min    1.0  3.0
25%    1.0  3.0
50%    1.0  3.0
75%    1.0  3.0
max    1.0  3.0
"""

#transpose the data
print(df2.T)
"""
                     0         ...                             3
a                    1         ...                             1
b  2019-01-01 00:00:00         ...           2019-01-01 00:00:00
c                    3         ...                             3
d                   ss         ...                            ll

[4 rows x 4 columns]
"""

#sort by the column labels, in descending order
print(df2.sort_index(axis=1,ascending=False))
"""
    d  c          b  a
0  ss  3 2019-01-01  1
1  ll  3 2019-01-01  1
2  ss  3 2019-01-01  1
3  ll  3 2019-01-01  1
"""


Selecting data with pandas

datas = pd.date_range('2019',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=datas,columns=['A','B','C','D'])
print(df)
"""
             A   B   C   D
2019-01-01   0   1   2   3
2019-01-02   4   5   6   7
2019-01-03   8   9  10  11
2019-01-04  12  13  14  15
2019-01-05  16  17  18  19
2019-01-06  20  21  22  23
"""

#two equivalent ways to select a single column of df
print(df.A)
print(df['A'])
"""
2019-01-01     0
2019-01-02     4
2019-01-03     8
2019-01-04    12
2019-01-05    16
2019-01-06    20
Freq: D, Name: A, dtype: int32
2019-01-01     0
2019-01-02     4
2019-01-03     8
2019-01-04    12
2019-01-05    16
2019-01-06    20
Freq: D, Name: A, dtype: int32
"""

#slicing
print(df[0:3])#positional slice: position 3 is excluded (half-open interval), so df[3:3] would be an empty DataFrame
print(df['2019-01-01':'2019-01-03'])#label slice: both endpoints are included
"""
            A  B   C   D
2019-01-01  0  1   2   3
2019-01-02  4  5   6   7
2019-01-03  8  9  10  11
            A  B   C   D
2019-01-01  0  1   2   3
2019-01-02  4  5   6   7
2019-01-03  8  9  10  11
"""

#loc selects by label: pick a single row by its index label, or pick some or all rows and then one or more columns
print(df.loc['2019-01-01'])
"""
A    0
B    1
C    2
D    3
Name: 2019-01-01 00:00:00, dtype: int32
"""
print(df.loc[:,['A','B']])
"""
             A   B
2019-01-01   0   1
2019-01-02   4   5
2019-01-03   8   9
2019-01-04  12  13
2019-01-05  16  17
2019-01-06  20  21
"""
print(df.loc['2019-01-01',['A','B']])
"""
A    0
B    1
Name: 2019-01-01 00:00:00, dtype: int32
"""

#selecting by position with iloc
#iloc selects purely by integer position: a single cell, a contiguous block, or non-contiguous rows
print(df.iloc[3,1])
"""
13
"""
print(df.iloc[3:5,1:3])
"""
             B   C
2019-01-04  13  14
2019-01-05  17  18
"""
print(df.iloc[[1,3,5],1:3])
"""
             B   C
2019-01-02   5   6
2019-01-04  13  14
2019-01-06  21  22
"""

#ix mixed label-based and position-based selection; it was deprecated in pandas 0.20 and removed in 1.0, so the call below only runs on old versions (a loc/iloc equivalent follows this example)
#select columns 'A' and 'C' for the first three rows
print(df.ix[:3,['A','C']])
"""
            A   C
2019-01-01  0   2
2019-01-02  4   6
2019-01-03  8  10
"""

#boolean indexing: select the rows that satisfy a condition
print(df[df.A>8])
"""
             A   B   C   D
2019-01-04  12  13  14  15
2019-01-05  16  17  18  19
2019-01-06  20  21  22  23
"""

print(df.A[df.A>8])
"""
2019-01-04    12
2019-01-05    16
2019-01-06    20
Freq: D, Name: A, dtype: int32
"""

Setting values with pandas

#pandas lets us modify values in place and add new columns, empty or filled
#first create a DataFrame
datas = pd.date_range('2019',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=datas,columns=['A','B','C','D'])
print(df)
"""
             A   B   C   D
2019-01-01   0   1   2   3
2019-01-02   4   5   6   7
2019-01-03   8   9  10  11
2019-01-04  12  13  14  15
2019-01-05  16  17  18  19
2019-01-06  20  21  22  23
"""

#set values via iloc (by position) and loc (by label)
df.iloc[1,1]=111
df.loc['2019-01-01','B'] = 222
print(df)
"""
             A    B   C   D
2019-01-01   0  222   2   3
2019-01-02   4  111   6   7
2019-01-03   8    9  10  11
2019-01-04  12   13  14  15
2019-01-05  16   17  18  19
2019-01-06  20   21  22  23
"""
#set values conditionally: change B wherever the value in A meets a condition
df.loc[df.A>12,'B'] = 0
print(df)
"""
             A    B   C   D
2019-01-01   0  222   2   3
2019-01-02   4  111   6   7
2019-01-03   8    9  10  11
2019-01-04  12   13  14  15
2019-01-05  16    0  18  19
2019-01-06  20    0  22  23
"""

#add new columns: an empty one, one from a single-column DataFrame, and one from a Series aligned on the index
df['E'] = np.nan
df['F'] = pd.DataFrame(np.arange(6).reshape((6,1)),index=datas)
df['G'] = pd.Series([1,2,3,4,5,6],index=pd.date_range('2019',periods=6))
print(df)
"""
             A    B   C   D   E  F  G
2019-01-01   0  222   2   3 NaN  0  1
2019-01-02   4  111   6   7 NaN  1  2
2019-01-03   8    9  10  11 NaN  2  3
2019-01-04  12   13  14  15 NaN  3  4
2019-01-05  16    0  18  19 NaN  4  5
2019-01-06  20    0  22  23 NaN  5  6
"""

Handling missing data with pandas

#real data often contains missing values (NaN); below are the basic ways to deal with them
#create a DataFrame that contains NaN
datas = pd.date_range('2019',periods=4)
df = pd.DataFrame(np.arange(16).reshape((4,4)),index=datas,columns=['A','B','C','D'])
df.loc['2019-01-01','C'] = np.nan
df.iloc[2,0] = np.nan
print(df)
"""
               A   B     C   D
2019-01-01   0.0   1   NaN   3
2019-01-02   4.0   5   6.0   7
2019-01-03   NaN   9  10.0  11
2019-01-04  12.0  13  14.0  15
"""

#drop rows or columns containing NaN with dropna
newdf = df.dropna(
    axis=1,#0: operate on rows, 1: operate on columns
    how='any'#'any': drop if the row/column contains any NaN; 'all': drop only if the entire row/column is NaN
)
print(newdf)
"""
             B   D
2019-01-01   1   3
2019-01-02   5   7
2019-01-03   9  11
2019-01-04  13  15
"""

#replace NaN with another value, e.g. 0
newdf = df.fillna(value=0)
print(newdf)
"""
               A   B     C   D
2019-01-01   0.0   1   0.0   3
2019-01-02   4.0   5   6.0   7
2019-01-03   0.0   9  10.0  11
2019-01-04  12.0  13  14.0  15
"""

#check which cells are missing; True marks a NaN
print(df.isnull())
"""
                A      B      C      D
2019-01-01  False  False   True  False
2019-01-02  False  False  False  False
2019-01-03   True  False  False  False
2019-01-04  False  False  False  False
"""

#when the data is too large to scan by eye, np.any(df.isnull()) reports whether any NaN exists and returns True if so
print(np.any(df.isnull()))
"""
True
"""

pandas input and output

#pandas can read and write many formats, such as csv, excel, html and pickle; see the official IO guide: https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html
import pandas as pd
#read (path is a placeholder for the file path)
data = pd.read_csv(path)
#save (here to a pickle file)
data.to_pickle(path)
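
As a self-contained round trip (the file name 'example.csv' is just a placeholder for this sketch):

#write a small frame to CSV and read it back (sketch)
small = pd.DataFrame({'a':[1,2,3],'b':[4,5,6]})
small.to_csv('example.csv',index=False)#index=False keeps the row index out of the file
data = pd.read_csv('example.csv')
print(data)
"""
   a  b
0  1  4
1  2  5
2  3  6
"""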

Combining data with concat

#when working with several data sets we often need to combine them; concat is the basic tool
#define the data sets
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
#axis controls the direction of the concatenation: 0 stacks vertically, 1 places the frames side by side
#ignore_index=True resets the index of the result
newdf1 = pd.concat([df1,df2,df3],axis=0)
print(newdf1)
newdf2 = pd.concat([df1,df2,df3],axis=0,ignore_index=True)
print(newdf2)
"""
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
"""

#join='outer' is the default: the frames are aligned on the columns they share, columns unique to one frame keep their own column, and missing values are filled with NaN
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'], index=[1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['b','c','d','e'], index=[2,3,4])
print(df1)
print(df2)
newdf = pd.concat([df1,df2],join='outer',axis=0)
print(newdf)
"""
     a    b    c    d
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  0.0  0.0  0.0  0.0
     b    c    d    e
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
     a    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
3  0.0  0.0  0.0  0.0  NaN
2  NaN  1.0  1.0  1.0  1.0
3  NaN  1.0  1.0  1.0  1.0
4  NaN  1.0  1.0  1.0  1.0
"""
#join='inner' keeps only the columns shared by all frames and drops the rest
newdf = pd.concat([df1,df2],join='inner',axis=0)
print(newdf)
"""
     b    c    d
1  0.0  0.0  0.0
2  0.0  0.0  0.0
3  0.0  0.0  0.0
2  1.0  1.0  1.0
3  1.0  1.0  1.0
4  1.0  1.0  1.0
"""
newdf = pd.concat([df1,df2],join='inner',axis=1)
print(newdf)
"""
     a    b    c    d    b    c    d    e
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
"""
#join_axes specifies the index of the result when concatenating horizontally (axis=1); note that join_axes was removed in pandas 1.0, so this call only works on older versions (see the sketch after this example)
newdf = pd.concat([df1,df2],join_axes=[df1.index],axis=1)
print(newdf)
"""
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
"""
newdf = pd.concat([df1,df2],axis=1)
print(newdf)
"""
     a    b    c    d    b    c    d    e
1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
"""
#append adds rows: it only stacks vertically, never horizontally; DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the calls below only run on older versions (a pd.concat equivalent follows this example)
#define the data sets
df1 = pd.DataFrame(np.ones((3,4))*0, columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1, columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2, columns=['a','b','c','d'])
s1 = pd.Series([1,2,3,4], index=['a','b','c','d'])
print(df1)
print(df2)
print(df3)
print(s1)
"""
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
     a    b    c    d
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     a    b    c    d
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
a    1
b    2
c    3
d    4
dtype: int64
"""
#append df2 below df1 and reset the index
newdf = df1.append(df2,ignore_index=True)
print(newdf)
"""
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
"""
#append several frames: put df2 and df3 below df1 and reset the index
newdf = df1.append([df2,df3],ignore_index=True)
print(newdf)
"""
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0
"""
#append a Series: add s1 as a new row of df1 and reset the index
newdf = df1.append(s1,ignore_index=True)
print(newdf)
"""
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  2.0  3.0  4.0
"""

Combining data with merge

#merge is similar to concat, but it is designed for joining data sets on key columns or on a shared index
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
#merge on a key column: by default merge joins on the columns the frames have in common; on='key' names it explicitly
res = pd.merge(left, right)
print(res)
"""
  key   A   B
0  K0  A0  B0
1  K1  A1  B1
2  K2  A2  B2
3  K3  A3  B3
  key   C   D
0  K0  C0  D0
1  K1  C1  D1
2  K2  C2  D2
3  K3  C3  D3
  key   A   B   C   D
0  K0  A0  B0  C0  D0
1  K1  A1  B1  C1  D1
2  K2  A2  B2  C2  D2
3  K3  A3  B3  C3  D3
"""
#merge on two key columns
#there are four join methods, how='left', 'right', 'outer' or 'inner'; the default is how='inner'
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                    'key2': ['K0', 'K1', 'K0', 'K1'],
                    'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                      'key2': ['K0', 'K0', 'K0', 'K0'],
                      'C': ['C0', 'C1', 'C2', 'C3'],
                      'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
"""
  key1 key2   A   B
0   K0   K0  A0  B0
1   K0   K1  A1  B1
2   K1   K0  A2  B2
3   K2   K1  A3  B3
  key1 key2   C   D
0   K0   K0  C0  D0
1   K1   K0  C1  D1
2   K1   K0  C2  D2
3   K2   K0  C3  D3
"""
res1 = pd.merge(left,right,how='inner')
print(res1)
"""
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
"""
res2 = pd.merge(left,right,how='outer')
print(res2)
"""
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3
"""
res3 = pd.merge(left,right,how='left')
print(res3)
"""
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
"""
res4 = pd.merge(left,right,how='right')
print(res4)
"""
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3
"""
#indicator
#indicator=True adds a _merge column recording which frame each row came from
df1 = pd.DataFrame({'key':[0,1], 'key_left':['a','b']})
df2 = pd.DataFrame({'key':[1,2,2],'key_right':[2,2,2]})
print(df1)
print(df2)
#merge on key with indicator=True
res = pd.merge(df1,df2,on='key',how='outer',indicator=True)
print(res)
"""
   key key_left
0    0        a
1    1        b
   key  key_right
0    1          2
1    2          2
2    2          2
   key key_left  key_right      _merge
0    0        a        NaN   left_only
1    1        b        2.0        both
2    2      NaN        2.0  right_only
3    2      NaN        2.0  right_only
"""
#give the indicator column a custom name
res2 = pd.merge(df1,df2,on='key',how='outer',indicator='change')
print(res2)
"""
   key key_left  key_right      change
0    0        a        NaN   left_only
1    1        b        2.0        both
2    2      NaN        2.0  right_only
3    2      NaN        2.0  right_only
"""
#merge on the index
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
print(right)
"""
     A   B
K0  A0  B0
K1  A1  B1
K2  A2  B2
     C   D
K0  C0  D0
K2  C2  D2
K3  C3  D3
"""
#merge on the index of both frames with how='outer'
res = pd.merge(left,right,left_index=True,right_index=True,how='outer')
print(res)
"""
      A    B    C    D
K0   A0   B0   C0   D0
K1   A1   B1  NaN  NaN
K2   A2   B2   C2   D2
K3  NaN  NaN   C3   D3
"""
#merge on the index of both frames with how='inner'
res2 = pd.merge(left,right,left_index=True,right_index=True,how='inner')
print(res2)
"""
     A   B   C   D
K0  A0  B0  C0  D0
K2  A2  B2  C2  D2
"""
#use suffixes to disambiguate columns that share a name but hold different data
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
res = pd.merge(boys,girls,suffixes=['_boys','_girls'],on='k',how='inner')
print(res)
"""
    k  age_boys  age_girls
0  K0         1          4
1  K0         1          5
"""