import numpy as np
import pandas as pd
df = pd. read_csv( 'C:/Users/admin/Desktop/joyful-pandas-master/joyful-pandas-master/data/table.csv' )
df. head( )
School Class ID Gender Address Height Weight Math Physics 0 S_1 C_1 1101 M street_1 173 63 34.0 A+ 1 S_1 C_1 1102 F street_2 192 73 32.5 B+ 2 S_1 C_1 1103 M street_2 186 82 87.2 B+ 3 S_1 C_1 1104 F street_2 167 81 80.4 B- 4 S_1 C_1 1105 F street_4 159 64 84.8 B+
1、append与assign
1.1、append方法
1.1.1、利用序列添加行(必须指定name)
df_append = df. loc[ : 3 , [ 'Gender' , 'Height' ] ] . copy( )
df_append
Gender Height 0 M 173 1 F 192 2 M 186 3 F 167
s = pd. Series( { 'Gender' : 'M' , 'Height' : '187' } , name= 'row_1' )
df_append. append( s)
Gender Height 0 M 173 1 F 192 2 M 186 3 F 167 row_1 M 187
1.1.2、用DataFrame添加表
df_temp = pd. DataFrame( { 'Gender' : [ 'F' , 'F' ] , 'Height' : [ '165' , '171' ] } , index= [ 'row1' , 'row2' ] )
df_append. append( df_temp)
Gender Height 0 M 173 1 F 192 2 M 186 3 F 167 row1 F 165 row2 F 171
1.2、assign方法
该方法主要用于添加列,列名直接由参数指定: 可以一次添加多个列:
s = pd. Series( list ( 'john' ) , index= range ( 4 ) )
df_append. assign( Letter= s)
Gender Height Letter 0 M 173 j 1 F 192 o 2 M 186 h 3 F 167 n
df_append. assign( col1= lambda x: x[ 'Gender' ] * 2 , col2= s)
Gender Height col1 col2 0 M 173 MM j 1 F 192 FF o 2 M 186 MM h 3 F 167 FF n
2、combine与update
2.1、combine方法
comine和update都是用于表的填充函数,可以根据某种规则填充
2.1.1 填充对象
df_combine_1 = df. loc[ : 1 , [ 'Gender' , 'Height' ] ] . copy( )
df_combine_2 = df. loc[ 10 : 11 , [ 'Gender' , 'Height' ] ] . copy( )
df_combine_1. combine( df_combine_2, lambda x, y: print ( x, y) )
0 M
1 F
10 NaN
11 NaN
Name: Gender, dtype: object 0 NaN
1 NaN
10 M
11 F
Name: Gender, dtype: object
0 173.0
1 192.0
10 NaN
11 NaN
Name: Height, dtype: float64 0 NaN
1 NaN
10 161.0
11 175.0
Name: Height, dtype: float64
Gender Height 0 NaN NaN 1 NaN NaN 10 NaN NaN 11 NaN NaN
df1 = pd. DataFrame( { 'A' : [ 1 , 2 ] , 'B' : [ 3 , 4 ] } )
df2 = pd. DataFrame( { 'A' : [ 8 , 7 ] , 'B' : [ 6 , 5 ] } )
df1. combine( df2, lambda x, y: x if x. mean( ) > y. mean( ) else y)
索引对齐特性(默认状态下,后面的表没有的行列都会设置为NaN
df2 = pd. DataFrame( { 'B' : [ 8 , 7 ] , 'C' : [ 6 , 5 ] } , index= [ 1 , 2 ] )
df1. combine( df2, lambda x, y: x if x. mean( ) > y. mean( ) else y)
A B C 0 NaN NaN NaN 1 NaN 8.0 6.0 2 NaN 7.0 5.0
df1. combine( df2, lambda x, y: x if x. mean( ) > y. mean( ) else y, overwrite= False )
A B C 0 1.0 NaN NaN 1 2.0 8.0 6.0 2 NaN 7.0 5.0
df1. combine( df2, lambda x, y: x if x. mean( ) > y. mean( ) else y, fill_value= 1 )
A B C 0 1.0 1.0 1.0 1 2.0 8.0 6.0 2 1.0 7.0 5.0
2.1.2、combine_first方法
这个方法作用是用df2填补df1的缺失值,功能比较简单,但很多时候会比combine更常用
df1 = pd. DataFrame( { 'A' : [ 0 , None ] , 'B' : [ None , 6 ] } )
df2 = pd. DataFrame( { 'A' : [ 2 , 4 ] , 'B' : [ 1 , 1 ] } )
df1. combine_first( df2)
df1 = pd. DataFrame( { 'A' : [ None , 0 ] , 'B' : [ 4 , None ] } )
df2 = pd. DataFrame( { 'B' : [ 3 , 3 ] , 'C' : [ 1 , 1 ] } , index= [ 1 , 2 ] )
df1. combine_first( df2)
A B C 0 NaN 4.0 NaN 1 0.0 3.0 1.0 2 NaN 3.0 1.0
2、update方法
①返回的框索引只会与被调用框的一致(默认使用左连接,下一节会介绍) ②第二个框中的nan元素不会起作用 ③没有返回值,直接在df上操作
例①:索引完全对齐情况下的操作
df1 = pd. DataFrame( { 'A' : [ 1 , 2 , 3 ] , 'B' : [ 100 , 200 , 300 ] } )
df2 = pd. DataFrame( { 'B' : [ 4 , 5 , 6 ] ,
'C' : [ 7 , 8 , 9 ] } )
df1. update( df2)
df2. update( df1)
df2
例②:部分填充
df1 = pd. DataFrame( { 'A' : [ 'a' , 'b' , 'c' ] ,
'B' : [ 'x' , 'y' , 'z' ] } )
df2 = pd. DataFrame( { 'B' : [ 'd' , 'e' ] } , index= [ 1 , 2 ] )
df1. update( df2)
df1
例3:nan元素不会起作用
df1 = pd. DataFrame( { 'A' : [ 1 , 2 , 3 ] ,
'B' : [ 400 , 500 , 600 ] } )
df2 = pd. DataFrame( { 'B' : [ 4 , np. nan, 6 ] } )
df1. update( df2)
df1
3、concat方法
df1 = pd. DataFrame( { 'A' : [ 'A0' , 'A1' ] , 'B' : [ 'B0' , 'B1' ] } , index = [ 0 , 1 ] )
df2 = pd. DataFrame( { 'A' : [ 'A2' , 'A3' ] , 'B' : [ 'B2' , 'B3' ] } , index = [ 2 , 3 ] )
df3 = pd. DataFrame( { 'A' : [ 'A1' , 'A3' ] , 'D' : [ 'D1' , 'D3' ] , 'E' : [ 'E1' , 'E3' ] } , index = [ 1 , 3 ] )
pd. concat( [ df1, df2] )
pd. concat( [ df1, df2] , axis= 1 )
A B A B 0 A0 B0 NaN NaN 1 A1 B1 NaN NaN 2 NaN NaN A2 B2 3 NaN NaN A3 B3
pd. concat( [ df3, df1] , join= 'inner' )
pd. concat( [ df3, df1] , join= 'outer' , sort= True )
A B D E 1 A1 NaN D1 E1 3 A3 NaN D3 E3 0 A0 B0 NaN NaN 1 A1 B1 NaN NaN
pd. concat( [ df3, df1] , verify_integrity= True , sort= True )
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-35-ce6f08b9d774> in <module>
----> 1 pd.concat([df3,df1],verify_integrity=True,sort=True)
D:\Users\admin\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in concat(objs, axis, join, ignore_index, keys, levels, names, verify_integrity, sort, copy)
279 verify_integrity=verify_integrity,
280 copy=copy,
--> 281 sort=sort,
282 )
283
D:\Users\admin\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in __init__(self, objs, axis, join, keys, levels, names, ignore_index, verify_integrity, copy, sort)
450 self.copy = copy
451
--> 452 self.new_axes = self._get_new_axes()
453
454 def get_result(self):
D:\Users\admin\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in _get_new_axes(self)
515 return [
516 self._get_concat_axis() if i == self.axis else self._get_comb_axis(i)
--> 517 for i in range(ndim)
518 ]
519
D:\Users\admin\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in <listcomp>(.0)
515 return [
516 self._get_concat_axis() if i == self.axis else self._get_comb_axis(i)
--> 517 for i in range(ndim)
518 ]
519
D:\Users\admin\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in _get_concat_axis(self)
570 )
571
--> 572 self._maybe_check_integrity(concat_axis)
573
574 return concat_axis
D:\Users\admin\Anaconda3\lib\site-packages\pandas\core\reshape\concat.py in _maybe_check_integrity(self, concat_index)
580 raise ValueError(
581 "Indexes have overlapping values: "
--> 582 "{overlap!s}".format(overlap=overlap)
583 )
584
ValueError: Indexes have overlapping values: Int64Index([1], dtype='int64')
s = pd. Series( [ 'F0' , 'F1' ] , name= 'F' )
pd. concat( [ df1, s] , axis= 1 )
4、merge与join