以下是按照教程进行数据分析的过程
import warnings
warnings. filterwarnings( 'ignore' )
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
import missingno as msno
# Load the training and test sets; both files are space-delimited.
Train_data = pd.read_csv('car_train_0110.csv', sep=' ')
Test_data = pd.read_csv('car_testA_0110.csv', sep=' ')
# Preview the first and last five rows of the training set in one table.
# DataFrame.append was removed in pandas 2.0, so the two slices are stacked
# with pd.concat, which yields the same rows in the same order.
pd.concat([Train_data.head(), Train_data.tail()])
SaleID
name
regDate
model
brand
bodyType
fuelType
gearbox
power
kilometer
...
v_14
v_15
v_16
v_17
v_18
v_19
v_20
v_21
v_22
v_23
0
134890
734
20160002
13.0
9
NaN
0.0
1.0
0
15.0
...
0.092139
0.000000
18.763832
-1.512063
-1.008718
-12.100623
-0.947052
9.077297
0.581214
3.945923
1
306648
196973
20080307
72.0
9
7.0
5.0
1.0
173
15.0
...
0.001070
0.122335
-5.685612
-0.489963
-2.223693
-0.226865
-0.658246
-3.949621
4.593618
-1.145653
2
340675
25347
20020312
18.0
12
3.0
0.0
1.0
50
12.5
...
0.064410
0.003345
-3.295700
1.816499
3.554439
-0.683675
0.971495
2.625318
-0.851922
-1.246135
3
57332
5382
20000611
38.0
8
7.0
0.0
1.0
54
15.0
...
0.069231
0.000000
-3.405521
1.497826
4.782636
0.039101
1.227646
3.040629
-0.801854
-1.251894
4
265235
173174
20030109
87.0
0
5.0
5.0
1.0
131
3.0
...
0.000099
0.001655
-4.475429
0.124138
1.364567
-0.319848
-1.131568
-3.303424
-1.998466
-1.279368
249995
10556
9332
20170003
13.0
9
NaN
NaN
1.0
58
15.0
...
0.079119
0.001447
11.782508
20.402576
-2.722772
0.462388
-4.429385
7.883413
0.698405
-1.082013
249996
146710
102110
20030511
29.0
17
3.0
0.0
0.0
61
15.0
...
0.000000
0.002342
-2.988272
1.500532
3.502201
-0.761715
-2.484556
-2.532968
-0.940266
-1.106426
249997
116066
82802
20130312
124.0
16
6.0
0.0
1.0
122
3.0
...
0.003358
0.100760
-6.939560
-1.144959
-5.337949
0.896026
-0.592565
-3.872725
2.135984
3.807554
249998
90082
65971
20121212
111.0
4
7.0
5.0
0.0
184
9.0
...
0.002974
0.008251
-7.222167
-1.383696
-5.402794
-0.409451
-1.891556
-3.104789
-3.777374
3.186218
249999
76453
56954
20051111
13.0
9
3.0
0.0
1.0
58
12.5
...
0.000000
0.009071
10.491312
-11.270043
-0.272595
-0.026478
-2.168249
-0.980042
-0.955164
-1.169593
10 rows × 40 columns
name - 汽车编码
regDate - 汽车注册时间 – ***
model - 车型编码
brand - 品牌
bodyType - 车身类型
fuelType - 燃油类型
gearbox - 变速箱
power - 汽车功率
kilometer - 汽车行驶公里
notRepairedDamage - 汽车有尚未修复的损坏 – ***
regionCode - 看车地区编码
seller - 销售方
offerType - 报价类型
creatDate - 广告发布时间
price - 汽车价格
# Display the column labels of the training data.
Train_data. columns
Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType',
'gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode',
'seller', 'offerType', 'creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3',
'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
'v_13', 'v_14', 'v_15', 'v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21',
'v_22', 'v_23'],
dtype='object')
# Names of the 16 non-anonymised columns, i.e. every column of the training
# data except v_0 .. v_23.
# This is a plain list of labels. The earlier chained assignment also stored
# the list on the DataFrame under a misspelled attribute name (`cloumns`),
# which was an unintended side effect and has been dropped.
Train_data_part = [
    'SaleID',
    'name',
    'regDate',
    'model',
    'brand',
    'bodyType',
    'fuelType',
    'gearbox',
    'power',
    'kilometer',
    'notRepairedDamage',
    'regionCode',
    'seller',
    'offerType',
    'creatDate',
    'price',
]
Train_data_part
['SaleID',
'name',
'regDate',
'model',
'brand',
'bodyType',
'fuelType',
'gearbox',
'power',
'kilometer',
'notRepairedDamage',
'regionCode',
'seller',
'offerType',
'creatDate',
'price']
# Show descriptive statistics (count, mean, ...) for the training data.
Train_data. describe( )
SaleID
name
regDate
model
brand
bodyType
fuelType
gearbox
power
kilometer
...
v_14
v_15
v_16
v_17
v_18
v_19
v_20
v_21
v_22
v_23
count
250000.000000
250000.000000
2.500000e+05
250000.000000
250000.000000
224620.000000
227510.000000
236487.000000
250000.000000
250000.000000
...
250000.000000
250000.000000
250000.000000
250000.000000
250000.000000
250000.000000
250000.000000
250000.000000
250000.000000
250000.000000
mean
185351.790768
83153.362172
2.003401e+07
44.911480
7.785236
4.563271
1.665008
0.780783
115.528412
12.577418
...
0.032489
0.030408
0.014725
0.000915
0.006273
0.006604
-0.001374
0.000609