今天来对空气数据从新定义一个质量评价指标。样例数据如下图。
1 导入必要的安装包
import pandas as pd
import numpy as np
data = pd.read_csv(r'E:\下载\石家庄20160701-20170701.csv')
df = pd.DataFrame(data)
print(df.head())
2 对数据做基本处理
# 质量等级编码
grade_map = {'优':1, '良':2, '轻度污染':3, '中度污染':4, '重度污染':5}
df['质量等级编码'] = df['质量等级'].map(grade_map).fillna(2)
df.drop(['日期', '质量等级'], axis=1, inplace=True)
# # 数据标准化
data_matrix = df.values
min_vals = np.min(data_matrix, axis=0)
max_vals = np.max(data_matrix, axis=0)
ranges = max_vals - min_vals
ranges[ranges == 0] = 1 # 处理常数列
normalized = (data_matrix - min_vals) / ranges
3 权重计算
# # 熵权法计算
epsilon = 1e-10
normalized += epsilon # 避免log(0)
p = normalized / np.sum(normalized, axis=0)
k = 1 / np.log(len(df))
e = -k * np.sum(p * np.log(p), axis=0)
weights = (1 - e) / np.sum(1 - e)
# # 综合得分计算
scores = np.dot(normalized, weights)
df['新空气质量指标'] = scores
4 新指标结果显示
print("各字段权重:")
for col, w in zip(df.columns, weights):
print(f"{col}: {w:.4f}")
print("\n包含新指标的数据:")
df.sort_values('新空气质量指标',inplace=True,ascending=False) # 降序显示
print(df[['AQI', 'PM2.5', 'PM10', 'SO2', 'CO', 'NO2', 'O3_8h', '新空气质量指标']])