合并来自多个表的数据
import pandas as pd
air_quality_no2= pd. read_csv( "data/air_quality_long.csv" , parse_dates= True )
air_quality_no2= air_quality_no2[ [ "date.utc" , "location" , "parameter" , "value" ] ]
air_quality_no2. head( )
date.utc location parameter value 0 2019-06-18 06:00:00+00:00 BETR801 pm25 18.0 1 2019-06-17 08:00:00+00:00 BETR801 pm25 6.5 2 2019-06-17 07:00:00+00:00 BETR801 pm25 18.5 3 2019-06-17 06:00:00+00:00 BETR801 pm25 16.0 4 2019-06-17 05:00:00+00:00 BETR801 pm25 7.5
air_quality_pm25= pd. read_csv( "data/air_quality_pm25_long.csv" , parse_dates= True )
air_quality_pm25. head( )
city country date.utc location parameter value unit 0 Antwerpen BE 2019-06-18 06:00:00+00:00 BETR801 pm25 18.0 µg/m³ 1 Antwerpen BE 2019-06-17 08:00:00+00:00 BETR801 pm25 6.5 µg/m³ 2 Antwerpen BE 2019-06-17 07:00:00+00:00 BETR801 pm25 18.5 µg/m³ 3 Antwerpen BE 2019-06-17 06:00:00+00:00 BETR801 pm25 16.0 µg/m³ 4 Antwerpen BE 2019-06-17 05:00:00+00:00 BETR801 pm25 7.5 µg/m³
1. concat()函数:
沿一个轴(行或列)执行多个表的串联操作 默认沿行进行拼接 concat()的keys参数:提供了一个附加的(分层的)行索引
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Ina28uDp-1595754064474)(attachment:image.png)]
air_quality= pd. concat( [ air_quality_pm25, air_quality_no2] )
air_quality. head( )
city country date.utc location parameter value unit 0 Antwerpen BE 2019-06-18 06:00:00+00:00 BETR801 pm25 18.0 µg/m³ 1 Antwerpen BE 2019-06-17 08:00:00+00:00 BETR801 pm25 6.5 µg/m³ 2 Antwerpen BE 2019-06-17 07:00:00+00:00 BETR801 pm25 18.5 µg/m³ 3 Antwerpen BE 2019-06-17 06:00:00+00:00 BETR801 pm25 16.0 µg/m³ 4 Antwerpen BE 2019-06-17 05:00:00+00:00 BETR801 pm25 7.5 µg/m³
print ( "Shape of pm25:" , air_quality_pm25. shape)
print ( "Shape of no2:" , air_quality_no2. shape)
print ( "shape of quality:" , air_quality. shape)
Shape of pm25: (1110, 7)
Shape of no2: (5272, 4)
shape of quality: (6382, 7)
air_quality= air_quality. sort_values( "date.utc" )
air_quality
city country date.utc location parameter value unit 3500 NaN NaN 2019-04-09 01:00:00+00:00 FR04014 no2 24.4 NaN 176 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 pm25 76.0 NaN 3663 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 no2 22.5 NaN 5271 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster no2 67.0 NaN 1824 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster pm25 42.0 NaN ... ... ... ... ... ... ... ... 102 London GB 2019-06-20 23:00:00+00:00 London Westminster pm25 7.0 µg/m³ 178 NaN NaN 2019-06-20 23:00:00+00:00 London Westminster pm25 7.0 NaN 177 NaN NaN 2019-06-21 00:00:00+00:00 London Westminster pm25 7.0 NaN 1825 NaN NaN 2019-06-21 00:00:00+00:00 FR04014 no2 20.0 NaN 101 London GB 2019-06-21 00:00:00+00:00 London Westminster pm25 7.0 µg/m³
6382 rows × 7 columns
air_quality_= pd. concat( [ air_quality_pm25, air_quality_no2] , keys= [ "PM25" , "NO2" ] )
air_quality_
city country date.utc location parameter value unit PM25 0 Antwerpen BE 2019-06-18 06:00:00+00:00 BETR801 pm25 18.0 µg/m³ 1 Antwerpen BE 2019-06-17 08:00:00+00:00 BETR801 pm25 6.5 µg/m³ 2 Antwerpen BE 2019-06-17 07:00:00+00:00 BETR801 pm25 18.5 µg/m³ 3 Antwerpen BE 2019-06-17 06:00:00+00:00 BETR801 pm25 16.0 µg/m³ 4 Antwerpen BE 2019-06-17 05:00:00+00:00 BETR801 pm25 7.5 µg/m³ ... ... ... ... ... ... ... ... ... NO2 5267 NaN NaN 2019-04-09 06:00:00+00:00 London Westminster no2 41.0 NaN 5268 NaN NaN 2019-04-09 05:00:00+00:00 London Westminster no2 41.0 NaN 5269 NaN NaN 2019-04-09 04:00:00+00:00 London Westminster no2 41.0 NaN 5270 NaN NaN 2019-04-09 03:00:00+00:00 London Westminster no2 67.0 NaN 5271 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster no2 67.0 NaN
6382 rows × 7 columns
2. reset_index():将任何级别的索引转换为列
air_quality. reset_index( level= 0 )
index city country date.utc location parameter value unit 0 3500 NaN NaN 2019-04-09 01:00:00+00:00 FR04014 no2 24.4 NaN 1 176 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 pm25 76.0 NaN 2 3663 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 no2 22.5 NaN 3 5271 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster no2 67.0 NaN 4 1824 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster pm25 42.0 NaN ... ... ... ... ... ... ... ... ... 6377 102 London GB 2019-06-20 23:00:00+00:00 London Westminster pm25 7.0 µg/m³ 6378 178 NaN NaN 2019-06-20 23:00:00+00:00 London Westminster pm25 7.0 NaN 6379 177 NaN NaN 2019-06-21 00:00:00+00:00 London Westminster pm25 7.0 NaN 6380 1825 NaN NaN 2019-06-21 00:00:00+00:00 FR04014 no2 20.0 NaN 6381 101 London GB 2019-06-21 00:00:00+00:00 London Westminster pm25 7.0 µg/m³
6382 rows × 8 columns
merge():加入使用公共标识符表(自然连接)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-9LXXyMGI-1595754064481)(attachment:image.png)]
station_coord= pd. read_csv( "data/air_quality_stations.csv" )
station_coord. head( )
location coordinates.latitude coordinates.longitude 0 BELAL01 51.23619 4.38522 1 BELHB23 51.17030 4.34100 2 BELLD01 51.10998 5.00486 3 BELLD02 51.12038 5.02155 4 BELR833 51.32766 4.36226
air_quality. head( )
city country date.utc location parameter value unit 3500 NaN NaN 2019-04-09 01:00:00+00:00 FR04014 no2 24.4 NaN 176 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 pm25 76.0 NaN 3663 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 no2 22.5 NaN 5271 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster no2 67.0 NaN 1824 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster pm25 42.0 NaN
air_quality= pd. merge( air_quality, station_coord, how= 'left' , on= 'location' )
air_quality. head( )
city country date.utc location parameter value unit coordinates.latitude coordinates.longitude 0 NaN NaN 2019-04-09 01:00:00+00:00 FR04014 no2 24.4 NaN 48.83724 2.39390 1 NaN NaN 2019-04-09 01:00:00+00:00 FR04014 no2 24.4 NaN 48.83722 2.39390 2 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 pm25 76.0 NaN 51.20966 4.43182 3 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 no2 22.5 NaN 51.20966 4.43182 4 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster no2 67.0 NaN 51.49467 -0.13193
air_quality_parameters = pd. read_csv( "data/air_quality_parameters.csv" )
air_quality_parameters. head( )
id description name 0 bc Black Carbon BC 1 co Carbon Monoxide CO 2 no2 Nitrogen Dioxide NO2 3 o3 Ozone O3 4 pm10 Particulate matter less than 10 micrometers in... PM10
air_quality= pd. merge( air_quality, air_quality_parameters, how= 'left' , left_on= "parameter" , right_on= "id" )
air_quality. head( )
city country date.utc location parameter value unit coordinates.latitude coordinates.longitude id_x description_x name_x id_y description_y name_y 0 NaN NaN 2019-04-09 01:00:00+00:00 FR04014 no2 24.4 NaN 48.83724 2.39390 no2 Nitrogen Dioxide NO2 no2 Nitrogen Dioxide NO2 1 NaN NaN 2019-04-09 01:00:00+00:00 FR04014 no2 24.4 NaN 48.83722 2.39390 no2 Nitrogen Dioxide NO2 no2 Nitrogen Dioxide NO2 2 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 pm25 76.0 NaN 51.20966 4.43182 pm25 Particulate matter less than 2.5 micrometers i... PM2.5 pm25 Particulate matter less than 2.5 micrometers i... PM2.5 3 NaN NaN 2019-04-09 01:00:00+00:00 BETR801 no2 22.5 NaN 51.20966 4.43182 no2 Nitrogen Dioxide NO2 no2 Nitrogen Dioxide NO2 4 NaN NaN 2019-04-09 02:00:00+00:00 London Westminster no2 67.0 NaN 51.49467 -0.13193 no2 Nitrogen Dioxide NO2 no2 Nitrogen Dioxide NO2
小结:
使用该concat功能,可以将多个表按行连接到两列。
对于类似数据库的表合并/联接,请使用该merge 函数。