行业类别
% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import warnings
import seaborn as sns
import scipy. stats as st
import os
import re
warnings. filterwarnings( 'ignore' )
pd. set_option( 'display.max_rows' , None )
pd. set_option( 'display.max_columns' , None )
pd. set_option( 'max_colwidth' , None )
pd. set_option( "display.max_info_columns" , 2000 )
df = pd. read_excel( '数据分析助理笔试题.xls' , skiprows= 1 )
df. head( ) . append( df. tail( ) )
数据期 行业代码 经纬度 注册资本(人民币元) 企业经营日期 企业登记设立日期 企业缴纳社保人员数 0 201512 K7010 113.857559,22.581071 500000.0 2013-12-01 00:00:00 2012-07-05 00:00:00 59.0 1 201512 F5132 113.844955,22.715052 8175400.0 2012-11-01 00:00:00 2012-09-06 00:00:00 285.0 2 201512 F5261 113.857559,22.581071 25000000.0 2013-04-01 00:00:00 2012-07-27 00:00:00 178.0 3 201512 C2231 113.856368,22.736860 4000000.0 2012-12-01 00:00:00 2012-08-22 00:00:00 28.0 4 201512 L7212 113.803833,22.760572 8000000.0 2014-01-01 00:00:00 2012-08-02 00:00:00 31.0 22216 201912 C3971 NaN 20000000.0 2019-12-01 00:00:00 2019-10-11 00:00:00 NaN 22217 201912 N7810 113.883831,22.554986 NaN 00:00:00 00:00:00 NaN 22218 201912 S9221 113.830132,22.742925 NaN 2018-03-01 00:00:00 00:00:00 NaN 22219 201912 N7810 113.859368,22.618116 NaN 2018-04-01 00:00:00 00:00:00 NaN 22220 201912 P8321 113.811647,22.733314 NaN 00:00:00 00:00:00 NaN
df. shape
(22221, 7)
df. drop_duplicates( subset= None , keep= 'first' , inplace= True )
df. shape
(21204, 7)
df[ '类别' ] = df[ '行业代码' ] . apply ( lambda x: x[ : 1 ] )
df. head( ) . append( df. tail( ) )
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21204 entries, 0 to 22220
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 数据期 21204 non-null int64
1 行业代码 21204 non-null object
2 经纬度 18491 non-null object
3 注册资本(人民币元) 18928 non-null float64
4 企业经营日期 21204 non-null object
5 企业登记设立日期 21204 non-null object
6 企业缴纳社保人员数 20106 non-null float64
7 类别 21204 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 1.5+ MB
industry = df. copy( )
industry_group = industry. groupby( [ '类别' ] )
writer = pd. ExcelWriter( 'industry.xls' , engine= 'xlsxwriter' )
for name, group in industry_group:
group. to_excel( writer, sheet_name= name)
writer. save( )
date_group = industry. groupby( [ '数据期' ] )
date_num_industry = [ ]
date = [ ]
num_industry = [ ]
for name, group in date_group:
date. append( name// 100 )
num_industry. append( group. shape[ 0 ] )
date_num_industry. append( [ name, group. shape[ 0 ] ] )
print ( '{}年{}月,行业数量为{}' . format ( str ( name) [ : 4 ] , str ( name) [ - 2 : ] , group. shape[ 0 ] ) )
2015年12月,行业数量为1957
2016年12月,行业数量为3476
2017年12月,行业数量为4418
2018年12月,行业数量为5250
2019年12月,行业数量为6103
date_num_industry, date, num_industry
([[201512, 1957],
[201612, 3476],
[201712, 4418],
[201812, 5250],
[201912, 6103]],
[2015, 2016, 2017, 2018, 2019],
[1957, 3476, 4418, 5250, 6103])
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
fig = plt. figure( figsize= ( 6 , 4 ) )
title = plt. suptitle( '产业总量的变化' , fontsize= 16 )
fig. subplots_adjust( top= 0.92 , wspace= 0.3 )
ax = fig. add_subplot( 111 )
ax. set_xlabel( '数据期' , fontsize= 12 )
ax. set_ylabel( '企业数量' , fontsize= 12 )
ax. bar( date, num_industry, color= 'yellowgreen' )
ax. spines[ 'top' ] . set_visible( False )
ax. spines[ 'right' ] . set_visible( False )
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-dWSroCv1-1635240656601)(output_16_0.png)]
s_industry = industry. groupby( [ '数据期' , '类别' ] )
date_type = [ ]
cate_ind = [ ]
num_ind = [ ]
for inx, gp in s_industry:
date_type. append( inx[ 0 ] )
cate_ind. append( inx[ 1 ] )
num_ind. append( gp. shape[ 0 ] )
date_ind_change = pd. DataFrame( )
date_ind_change[ '数据期' ] = date_type
date_ind_change[ '类别' ] = cate_ind
date_ind_change[ '数量' ] = num_ind
date_ind_change
plt. figure( figsize= ( 16 , 10 ) )
cp = sns. countplot( x= '类别' , hue= '数据期' , data= industry)
plt. xlabel( '类别' , fontsize= 14 )
plt. ylabel( '产业数量' , fontsize= 14 )
plt. title( '不同时期下各类别产业的数量统计图' , fontsize= 18 )
Text(0.5, 1.0, '不同时期下各类别产业的数量统计图')
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vANRudNo-1635240656607)(output_20_1.png)]
for idx, gp in s_industry:
print ( idx, gp. shape[ 0 ] )
(201512, 'C') 1208
(201512, 'D') 5
(201512, 'E') 50
(201512, 'F') 281
(201512, 'G') 39
(201512, 'H') 60
(201512, 'I') 18
(201512, 'K') 85
(201512, 'L') 142
(201512, 'M') 24
(201512, 'N') 1
(201512, 'O') 13
(201512, 'P') 18
(201512, 'Q') 7
(201512, 'R') 6
(201612, 'C') 2356
(201612, 'D') 7
(201612, 'E') 66
(201612, 'F') 418
(201612, 'G') 119
(201612, 'H') 69
(201612, 'I') 37
(201612, 'J') 1
(201612, 'K') 114
(201612, 'L') 189
(201612, 'M') 41
(201612, 'N') 4
(201612, 'O') 16
(201612, 'P') 22
(201612, 'Q') 9
(201612, 'R') 8
(201712, 'C') 2793
(201712, 'D') 6
(201712, 'E') 77
(201712, 'F') 705
(201712, 'G') 152
(201712, 'H') 96
(201712, 'I') 64
(201712, 'K') 157
(201712, 'L') 221
(201712, 'M') 58
(201712, 'N') 9
(201712, 'O') 20
(201712, 'P') 28
(201712, 'Q') 9
(201712, 'R') 12
(201712, 'S') 11
(201812, 'C') 3233
(201812, 'D') 6
(201812, 'E') 90
(201812, 'F') 912
(201812, 'G') 204
(201812, 'H') 138
(201812, 'I') 92
(201812, 'K') 185
(201812, 'L') 242
(201812, 'M') 61
(201812, 'N') 6
(201812, 'O') 18
(201812, 'P') 38
(201812, 'Q') 9
(201812, 'R') 16
(201912, 'C') 3698
(201912, 'D') 10
(201912, 'E') 139
(201912, 'F') 1049
(201912, 'G') 218
(201912, 'H') 170
(201912, 'I') 90
(201912, 'J') 1
(201912, 'K') 245
(201912, 'L') 293
(201912, 'M') 73
(201912, 'N') 13
(201912, 'O') 21
(201912, 'P') 44
(201912, 'Q') 10
(201912, 'R') 16
(201912, 'S') 13
industry. head( )
数据期 行业代码 经纬度 注册资本(人民币元) 企业经营日期 企业登记设立日期 企业缴纳社保人员数 类别 0 201512 K7010 113.857559,22.581071 500000.0 2013-12-01 00:00:00 2012-07-05 00:00:00 59.0 K 1 201512 F5132 113.844955,22.715052 8175400.0 2012-11-01 00:00:00 2012-09-06 00:00:00 285.0 F 2 201512 F5261 113.857559,22.581071 25000000.0 2013-04-01 00:00:00 2012-07-27 00:00:00 178.0 F 3 201512 C2231 113.856368,22.736860 4000000.0 2012-12-01 00:00:00 2012-08-22 00:00:00 28.0 C 4 201512 L7212 113.803833,22.760572 8000000.0 2014-01-01 00:00:00 2012-08-02 00:00:00 31.0 L
industry[ '经度' ] = industry[ '经纬度' ] . apply ( lambda x: str ( x) [ : 10 ] )
industry[ '纬度' ] = industry[ '经纬度' ] . apply ( lambda x: str ( x) [ 11 : ] )
del industry[ '经纬度' ]
数据期 行业代码 注册资本(人民币元) 企业经营日期 企业登记设立日期 企业缴纳社保人员数 类别 经度 纬度 0 201512 K7010 500000.0 2013-12-01 00:00:00 2012-07-05 00:00:00 59.0 K 113.857559 22.581071 1 201512 F5132 8175400.0 2012-11-01 00:00:00 2012-09-06 00:00:00 285.0 F 113.844955 22.715052 2 201512 F5261 25000000.0 2013-04-01 00:00:00 2012-07-27 00:00:00 178.0 F 113.857559 22.581071 3 201512 C2231 4000000.0 2012-12-01 00:00:00 2012-08-22 00:00:00 28.0 C 113.856368 22.736860 4 201512 L7212 8000000.0 2014-01-01 00:00:00 2012-08-02 00:00:00 31.0 L 113.803833 22.760572
industry[ '数据期' ] = industry[ '数据期' ] . apply ( lambda x: x// 100 )
industry. head( )
数据期 行业代码 注册资本(人民币元) 企业经营日期 企业登记设立日期 企业缴纳社保人员数 类别 经度 纬度 0 2015 K7010 500000.0 2013-12-01 00:00:00 2012-07-05 00:00:00 59.0 K 113.857559 22.581071 1 2015 F5132 8175400.0 2012-11-01 00:00:00 2012-09-06 00:00:00 285.0 F 113.844955 22.715052 2 2015 F5261 25000000.0 2013-04-01 00:00:00 2012-07-27 00:00:00 178.0 F 113.857559 22.581071 3 2015 C2231 4000000.0 2012-12-01 00:00:00 2012-08-22 00:00:00 28.0 C 113.856368 22.736860 4 2015 L7212 8000000.0 2014-01-01 00:00:00 2012-08-02 00:00:00 31.0 L 113.803833 22.760572
industry. to_csv( 'industry.csv' , index= False )
china = ['D', 'J', 'N', 'Q', 'S']
n_china = ['C', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'O', 'P', 'R']
data_china = industry.copy()
data_china = data_china[~data_china['类别'].isin(n_china)]
plt.figure(figsize=(10, 8))
cp = sns.countplot(x='类别', hue='数据期', data=data_china)
plt.xlabel('类别',fontsize=14) #X轴的名称
plt.ylabel('产业数量',fontsize=14) #Y轴名称
plt.title('不同时期下国家控制类产业的数量统计图',fontsize=18) #图的名称
data_life = industry.copy()
data_life = data_life[data_life['类别'].isin(['C', 'F'])]
plt.figure(figsize=(6, 4))
cp = sns.countplot(x='类别', hue='数据期', data=data_life)
plt.xlabel('类别',fontsize=14) #X轴的名称
plt.ylabel('产业数量',fontsize=14) #Y轴名称
plt.title('人们日常消费相关产业的数量统计图',fontsize=18) #图的名称
data_other = industry.copy()
data_other = data_other[data_other['类别'].isin(['E', 'G', 'H', 'I', 'K', 'L', 'M', 'O', 'P', 'R'])]
plt.figure(figsize=(10, 8))
cp = sns.countplot(x='类别', hue='数据期', data=data_other)
plt.xlabel('类别',fontsize=14) #X轴的名称
plt.ylabel('产业数量',fontsize=14) #Y轴名称
plt.title('其他产业的数量统计图',fontsize=18) #图的名称```