合并数据

最新推荐文章于 2023-05-13 10:37:51 发布

原创最新推荐文章于 2023-05-13 10:37:51 发布 · 372 阅读

0 ·

CC 4.0 BY-SA版权

python基础专栏收录该内容

11 篇文章

订阅专栏

1、concat合并数据

import numpy as np
import pandas as pd

df1 = pd.DataFrame(np.ones((3,4))*1, columns=['A','B','C','D'])
df2 = pd.DataFrame(np.ones((3,4))*2, columns=['A','B','C','D'])
df3 = pd.DataFrame(np.ones((3,4))*3, columns=['A','B','C','D'])
print(df1)
print(df2)
print(df3)

     A    B    C    D
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     A    B    C    D
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0
     A    B    C    D
0  3.0  3.0  3.0  3.0
1  3.0  3.0  3.0  3.0
2  3.0  3.0  3.0  3.0

pd.concat([df1,df2,df3], axis=0, ignore_index=True)  #注意添加[]，axis=0表示竖向，忽略原索引，会产生新索引

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0
6	3.0	3.0	3.0	3.0
7	3.0	3.0	3.0	3.0
8	3.0	3.0	3.0	3.0

df4 = pd.DataFrame(np.ones((3,4))*1, columns=['A','B','C','D'])
df5 = pd.DataFrame(np.ones((3,4))*2, columns=['B','C','D','E'])
print(df4)
print(df5)

     A    B    C    D
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
     B    C    D    E
0  2.0  2.0  2.0  2.0
1  2.0  2.0  2.0  2.0
2  2.0  2.0  2.0  2.0

pd.concat([df4,df5], join='outer', ignore_index=True)  #join的inner参数取交集，outer取并集

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	A	B	C	D	E
0	1.0	1.0	1.0	1.0	NaN
1	1.0	1.0	1.0	1.0	NaN
2	1.0	1.0	1.0	1.0	NaN
3	NaN	2.0	2.0	2.0	2.0
4	NaN	2.0	2.0	2.0	2.0
5	NaN	2.0	2.0	2.0	2.0

df6 = pd.DataFrame(np.ones((3,4))*1, columns=['A','B','C','D'], index=[1,2,3])
df7 = pd.DataFrame(np.ones((3,4))*2, columns=['B','C','D','E'], index=[2,3,4])
pd.concat([df6,df7], axis=1, join_axes=[df6.index])  #若不加join_axes，行索引会有1，2，3，4

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	A	B	C	D	B	C	D	E
1	1.0	1.0	1.0	1.0	NaN	NaN	NaN	NaN
2	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
3	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

2、append合并数据

df8 = pd.DataFrame(np.ones((3,4))*1, columns=['A','B','C','D'])
s1 = pd.Series([1,2,3,4], index=['A','B','C','D'])
df8.append(s1, ignore_index=True)   #相当于在一个表里添加一条记录

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	1.0	2.0	3.0	4.0

df9 = pd.DataFrame(np.ones((3,4))*1, columns=['A','B','C','D'])
df10 = pd.DataFrame(np.ones((3,4))*2, columns=['A','B','C','D'])
df9.append(df10, ignore_index=True)  #合并两个表结构相同的表

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0

3、merge合并数据（更复杂）

left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
                    'key2':['K0','K1','K0','K1'],
                    'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K2'],
                    'key2':['K0','K0','K0','K0'],
                    'C':['C0','C1','C2','C3'],
                    'D':['D0','D1','D2','D3']})
print(left)
print(right)

    A   B key1 key2
0  A0  B0   K0   K0
1  A1  B1   K0   K1
2  A2  B2   K1   K0
3  A3  B3   K2   K1
    C   D key1 key2
0  C0  D0   K0   K0
1  C1  D1   K1   K0
2  C2  D2   K1   K0
3  C3  D3   K2   K0

pd.merge(left, right, on=['key1','key2'], how='inner')  #取key1, key2组合的交集，outer取并集，left基于left数据的的key来合并，right同理

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	A	B	key1	key2	C	D
0	A0	B0	K0	K0	C0	D0
1	A2	B2	K1	K0	C1	D1
2	A2	B2	K1	K0	C2	D2

#有时DataFrame中的连接键位于其索引中，可以传入left_index=True或right_index=True, 表明索引应该被用作连接键。
left1 = pd.DataFrame({'key':['a','a','b','c'],
         'v1':range(4)})
right1 = pd.DataFrame({'v2':[9,8]}, index=['a','b'])
print(left1)
print(right1)

pd.merge(left1, right1, left_on='key', right_index=True) #表示合并两个数据，连接键以left1的key为主；但right1的索引也应该被作为连接键。

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	key	v1	v2
0	a	0	9
1	a	1	9
2	b	2	8

#连接键重名的处理方式 
boys = pd.DataFrame({'k':['K0','K1','K2'],'age':[1,2,3]})
girls = pd.DataFrame({'k':['K0','K0','K3'],'age':[4,5,6]})
pd.merge(boys, girls, on='k', suffixes=['_boys','_girls'], how='outer')

.dataframe thead tr:only-child th { text-align: right; } .dataframe thead th { text-align: left; } .dataframe tbody tr th { vertical-align: top; }

	age_boys	k	age_girls
0	1.0	K0	4.0
1	1.0	K0	5.0
2	2.0	K1	NaN
3	3.0	K2	NaN
4	NaN	K3	6.0

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0
6	3.0	3.0	3.0	3.0
7	3.0	3.0	3.0	3.0
8	3.0	3.0	3.0	3.0

	A	B	C	D	E
0	1.0	1.0	1.0	1.0	NaN
1	1.0	1.0	1.0	1.0	NaN
2	1.0	1.0	1.0	1.0	NaN
3	NaN	2.0	2.0	2.0	2.0
4	NaN	2.0	2.0	2.0	2.0
5	NaN	2.0	2.0	2.0	2.0

	A	B	C	D	B	C	D	E
1	1.0	1.0	1.0	1.0	NaN	NaN	NaN	NaN
2	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
3	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0
6	3.0	3.0	3.0	3.0
7	3.0	3.0	3.0	3.0
8	3.0	3.0	3.0	3.0

	A	B	C	D	E
0	1.0	1.0	1.0	1.0	NaN
1	1.0	1.0	1.0	1.0	NaN
2	1.0	1.0	1.0	1.0	NaN
3	NaN	2.0	2.0	2.0	2.0
4	NaN	2.0	2.0	2.0	2.0
5	NaN	2.0	2.0	2.0	2.0

	A	B	C	D	B	C	D	E
1	1.0	1.0	1.0	1.0	NaN	NaN	NaN	NaN
2	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
3	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0
6	3.0	3.0	3.0	3.0
7	3.0	3.0	3.0	3.0
8	3.0	3.0	3.0	3.0

	A	B	C	D	E
0	1.0	1.0	1.0	1.0	NaN
1	1.0	1.0	1.0	1.0	NaN
2	1.0	1.0	1.0	1.0	NaN
3	NaN	2.0	2.0	2.0	2.0
4	NaN	2.0	2.0	2.0	2.0
5	NaN	2.0	2.0	2.0	2.0

	A	B	C	D	B	C	D	E
1	1.0	1.0	1.0	1.0	NaN	NaN	NaN	NaN
2	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0
3	1.0	1.0	1.0	1.0	2.0	2.0	2.0	2.0

	A	B	C	D
0	1.0	1.0	1.0	1.0
1	1.0	1.0	1.0	1.0
2	1.0	1.0	1.0	1.0
3	2.0	2.0	2.0	2.0
4	2.0	2.0	2.0	2.0
5	2.0	2.0	2.0	2.0