# read the Not Stated values in as NaN.
dete_survey = pd.read_csv("dete_survey.csv", na_values ="Not Stated")# Remove columns we don't need for our analysis
dete_survey_updated = dete_survey.drop(dete_survey.columns[28:49], axis=1)# Clean the column names
dete_survey_updated.columns = dete_survey_updated.columns.str.lower().str.strip().str.replace(' ','_')# Update column names to match the names in dete_survey_updated
mapping ={'Record ID':'id','CESSATION YEAR':'cease_date','Reason for ceasing employment':'separationtype','Gender. What is your Gender?':'gender','CurrentAge. Current Age':'age','Employment Type. Employment Type':'employment_status','Classification. Classification':'position','LengthofServiceOverall. Overall Length of Service at Institute (in years)':'institute_service','LengthofServiceCurrent. Length of Service at current workplace (in years)':'role_service'}
tafe_survey_updated = tafe_survey_updated.rename(mapping, axis =1)# Check the unique values for the separationtype column
tafe_survey_updated['separationtype'].value_counts()# Update all separation types containing the word "resignation" to 'Resignation'
dete_survey_updated['separationtype']= dete_survey_updated['separationtype'].str.split('-', expand =True)[0]# Extract the years and convert them to a float type
dete_resignations['cease_date']= dete_resignations['cease_date'].str.split('/').str[-1]
dete_resignations['cease_date']= dete_resignations['cease_date'].astype("float")# Check the unique values and look for outliers, sort_values()用于排序
dete_resignations['dete_start_date'].value_counts().sort_values()# Update the values in the contributing factors columns to be either True, False, or NaNdefupdate_vals(x):if x =='-':returnFalseelif pd.isnull(x):return np.nan
else:returnTrue
tafe_resignations['dissatisfied']= tafe_resignations[['Contributing Factors. Dissatisfaction','Contributing Factors. Job Dissatisfaction']].applymap(update_vals).any(axis =1, skipna=False)
tafe_resignations_up = tafe_resignations.copy()# Check the unique values after the updates
tafe_resignations_up['dissatisfied'].value_counts(dropna=False)# Combine the dataframes
combined = pd.concat([dete_resignations_up, tafe_resignations_up], ignore_index=True)# Verify the number of non null values in each column
combined.notnull().sum().sort_values()# Drop columns with less than 500 non null values
combined_updated = combined.dropna(thresh =500, axis =1).copy()# Check the unique values
combined_updated['institute_service'].value_counts(dropna=False)# Extract the years of service and convert the type to float
combined_updated['institute_service_up']= combined_updated['institute_service'].astype('str').str.extract(r'(\d+)')
combined_updated['institute_service_up']= combined_updated['institute_service_up'].astype('float')# Convert years of service to categoriesdeftransform_service(val):if val >=11:return"Veteran"elif7<= val <11:return"Established"elif3<= val <7:return"Experienced"elif pd.isnull(val):return np.nan
else:return"New"
combined_updated['service_cat']= combined_updated['institute_service_up'].apply(transform_service)# Replace missing values with the most frequent value, False
combined_updated['dissatisfied']= combined_updated['dissatisfied'].fillna(False)# Calculate the percentage of employees who resigned due to dissatisfaction in each category
dis_pct = combined_updated.pivot_table(index='service_cat', values='dissatisfied', aggfunc='mean')# Plot the results%matplotlib inline
dis_pct.plot(kind='bar', rot=30)