1.使用pd.concat()级联
(1)简单级联
数据代码:
import numpy as np
import pandas as pd
def make_df(indexs,columns):
data = [[str(j)+str(i) for j in columns] for i in indexs]
df = pd.DataFrame(data=data,index=indexs,columns=columns)
return df
df1 = make_df([1,2], ['A','B'])
df2 = make_df([3,4], ['A','B'])
print(df1)
print('#############################')
print(df2)
- axis:设置合并方向,默认垂直合并(axis=0),(axis=1)为水平合并
- ignore_indexs:为True时会忽略原有索引,重置索引
- keys:在水平或垂直索引上追加新索引,配合axis
import numpy as np
import pandas as pd
def make_df(indexs,columns):
data = [[str(j)+str(i) for j in columns] for i in indexs]
df = pd.DataFrame(data=data,index=indexs,columns=columns)
return df
df1 = make_df([1,2], ['A','B'])
df2 = make_df([3,4], ['A','B'])
#默认上下(垂直)合并
print(pd.concat([df1,df2]))
print('###############################')
#左右(水平)合并
print(pd.concat([df1,df2],axis=1))
print('###############################')
#忽略索引
print(pd.concat([df1,df2],ignore_index=True))
print('###############################')
#生成追加新索引
print(pd.concat([df1,df2],keys=['x','y']))
(2)不匹配级联
不匹配是指级联的维度的索引不一致
数据代码:
import numpy as np
import pandas as pd
def make_df(indexs,columns):
data = [[str(j)+str(i) for j in columns] for i in indexs]
df = pd.DataFrame(data=data,index=indexs,columns=columns)
return df
df1 = make_df([1,2,3,4],list('ABCD'))
df2 = make_df([2,3,4,5],list('BCDE'))
print(df1)
print('##################################')
print(df2)
- 外链接(join='outer'):类似并集,显示所有数据,无数据的位置用NaN填充
import numpy as np
import pandas as pd
def make_df(indexs,columns):
data = [[str(j)+str(i) for j in columns] for i in indexs]
df = pd.DataFrame(data=data,index=indexs,columns=columns)
return df
df1 = make_df([1,2,3,4],list('ABCD'))
df2 = make_df([2,3,4,5],list('BCDE'))
print(pd.concat([df1,df2]))
print('##################################')
print(pd.concat([df1,df2],join='outer'))
- 内连接(join='inner'):类似交集,只显示共同部分
import numpy as np
import pandas as pd
def make_df(indexs,columns):
data = [[str(j)+str(i) for j in columns] for i in indexs]
df = pd.DataFrame(data=data,index=indexs,columns=columns)
return df
df1 = make_df([1,2,3,4],list('ABCD'))
df2 = make_df([2,3,4,5],list('BCDE'))
print(pd.concat([df1,df2],join='inner'))
2.使用pd.merge()合并
- merge与concat的区别在于,merge需要依据某一共同的行或列来进行合并
- 使用pd.merge()合并时,会自动根据两者相同column名称的那一列,作为key来进行合并
- 每一列元素的顺序不要求一致
(1)一对一合并
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'name':['张三','李四','王五'],
'id':[1,2,3],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,3,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(pd.merge(df1,df2))
print('#############################')
print(df1.merge(df2))
(2)多对一合并
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'name':['张三','李四','王五'],
'id':[1,2,2],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,3,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2))
(3)多对多合并
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'name':['张三','李四','王五'],
'id':[1,2,2],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,2,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2))
(4)key的规范化
- 使用"on="显式指定哪一列为key,当2个DataFrame有多列相同时使用
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'id':[1,2,3],
'name':['张三','李四','王五'],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,3,4],
'name':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2,on='id'))
- 使用left_on和right_on指定左右两边的列作为key,当左右两边的key都不相等时使用
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'id':[1,2,3],
'name':['张三','李四','王五'],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id2':[2,3,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2,left_on='id',right_on='id2'))
- 当左边的列和右边的index相同的时候,使用right_index=True
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'id':[1,2,3],
'name':['张三','李四','王五'],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id2':[2,3,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2,left_index=True,right_index=True))
print('#############################')
print(df1.merge(df2,left_index=True,right_on='id2'))
(5)内合并与外合并
- 内合并(how='inner'):只保留两者都有的key(默认模式)
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'id':[1,2,3],
'name':['张三','李四','王五'],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,3,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(pd.merge(df1,df2)) #默认内连接
print(df1.merge(df2,how='inner'))
- 外合并(how='outer'):用NaN补全空缺
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'id':[1,2,3],
'name':['张三','李四','王五'],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,3,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2,how='outer',on='id'))
- 左合并、右合并:how='left'、how='right'
- 左合并:显示左边表的所有数据和右边表的公共数据
- 右合并:显示右边表的所有数据和左边表的公共数据
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'id':[1,2,3],
'name':['张三','李四','王五'],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,3,4],
'sex':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2,how='left'))
print('#############################')
print(df1.merge(df2,how='right'))
(6)列冲突的解决
当列冲突时,即有多个列名称相同时,需要使用"on="来指定哪一个列作为key,配合"suffixes="指定冲突列明
import numpy as np
import pandas as pd
df1 = pd.DataFrame({
'id':[1,2,3],
'name':['张三','李四','王五'],
'age':[22,33,44]
})
df2 = pd.DataFrame({
'id':[2,3,4],
'name':['男','女','男'],
'job':['Saler','CEO','Programer']
})
print(df1)
print('#############################')
print(df2)
print('#############################')
print(df1.merge(df2,on='id'))
print('#############################')
print(df1.merge(df2,on='id',suffixes=['_df1','_df2']))
知识点为听课总结笔记,课程为B站“千锋教育Pandas数据分析从入门到实战,零基础小白保姆级Python数据分析教程”:001_Pandas_Pandas介绍_哔哩哔哩_bilibili