import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Silence all library warnings for the whole run (applies process-wide).
warnings.filterwarnings('ignore')
# Configure matplotlib fonts so the Chinese chart labels can be rendered;
# the list is a fallback chain, the first installed family is used.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus sign.
plt.rcParams['axes.unicode_minus'] = False
class HairLossAnalyzer:
    """Explore, cluster and model hair-loss risk factors held in ``self.df``.

    The analyzer works on a DataFrame with the raw columns listed in
    ``REQUIRED_COLUMNS``. Construction loads a CSV (or generates synthetic
    sample data) and immediately derives the engineered columns via
    ``prepare_data``. Every analysis method writes PNG charts to the
    current working directory and prints a textual summary.
    """

    # Ordered hair-loss categories; list position is the numeric severity code.
    SEVERITY_LEVELS = ['无', '轻度', '中度', '重度']

    # Raw columns that prepare_data() reads from the input frame.
    REQUIRED_COLUMNS = (
        'age', 'gender', 'genetic_history', 'stress_level', 'sleep_hours',
        'exercise_frequency', 'smoking', 'alcohol',
        'vitamin_d', 'ferritin', 'zinc', 'hair_loss_degree',
    )

    def __init__(self, data_path=None):
        """Initialise the analyzer from a CSV file or from sample data.

        Args:
            data_path: Path of a CSV file to load. When falsy, synthetic
                sample data is generated instead.
        """
        self.df = None
        if data_path:
            self.load_data(data_path)
        else:
            self.generate_sample_data()
        # Both branches leave a raw frame in self.df; derive features once.
        self.prepare_data()

    def generate_sample_data(self):
        """Populate ``self.df`` with 500 reproducible synthetic records."""
        # A private RandomState seeded with 42 yields the same stream as
        # np.random.seed(42) would, without touching the global NumPy RNG.
        rng = np.random.RandomState(42)
        n_samples = 500

        # NOTE: the draw order below fixes the generated values; keep it.
        data = {
            'age': rng.normal(35, 10, n_samples).astype(int),
            'gender': rng.choice(['男', '女'], n_samples),
            'genetic_history': rng.choice(['是', '否'], n_samples, p=[0.3, 0.7]),
            'stress_level': rng.choice(['高', '中', '低'], n_samples, p=[0.4, 0.4, 0.2]),
            'sleep_hours': rng.normal(6.5, 1.5, n_samples).clip(3, 10),
            'exercise_frequency': rng.choice(['很少', '偶尔', '经常'], n_samples),
            'smoking': rng.choice(['是', '否'], n_samples),
            'alcohol': rng.choice(['是', '否'], n_samples),
            'vitamin_d': rng.normal(50, 20, n_samples),  # vitamin D level
            'ferritin': rng.normal(80, 40, n_samples),  # ferritin level
            'zinc': rng.normal(10, 2, n_samples),  # zinc level
            'hair_loss_degree': rng.choice(self.SEVERITY_LEVELS, n_samples),
        }

        # Inject correlation: older people with a family history, and highly
        # stressed short sleepers, are re-drawn towards more severe hair loss.
        for i in range(n_samples):
            if data['age'][i] > 45 and data['genetic_history'][i] == '是':
                data['hair_loss_degree'][i] = rng.choice(['中度', '重度'], p=[0.6, 0.4])
            elif data['stress_level'][i] == '高' and data['sleep_hours'][i] < 5:
                data['hair_loss_degree'][i] = rng.choice(['轻度', '中度'], p=[0.4, 0.6])

        self.df = pd.DataFrame(data)

    def load_data(self, data_path):
        """Load raw records from a CSV file into ``self.df``.

        Best effort: if the file cannot be read, or lacks any of the
        ``REQUIRED_COLUMNS``, the problem is printed and sample data is
        generated instead, so the analyzer always ends up with usable data.

        Args:
            data_path: Path (or buffer) accepted by ``pandas.read_csv``.
        """
        try:
            frame = pd.read_csv(data_path)
            missing = [col for col in self.REQUIRED_COLUMNS if col not in frame.columns]
            if missing:
                # Fail here rather than with an opaque KeyError in prepare_data.
                raise ValueError(f"缺少必要的列: {missing}")
            self.df = frame
            print(f"成功加载数据,共{len(self.df)}条记录")
        except Exception as e:  # deliberate catch-all: fall back to sample data
            print(f"加载数据失败: {e}")
            print("将使用示例数据进行分析")
            self.generate_sample_data()

    def prepare_data(self):
        """Add the engineered feature columns to ``self.df`` in place.

        Category values outside the mappings below become NaN in the
        corresponding ``*_code`` / ``*_score`` column.
        """
        # Nutrient deficiency flags (1 = below threshold) and their total.
        self.df['vitamin_d_deficiency'] = (self.df['vitamin_d'] < 30).astype(int)
        self.df['ferritin_deficiency'] = (self.df['ferritin'] < 50).astype(int)
        self.df['zinc_deficiency'] = (self.df['zinc'] < 8).astype(int)
        self.df['nutrient_deficiency_count'] = (
            self.df['vitamin_d_deficiency']
            + self.df['ferritin_deficiency']
            + self.df['zinc_deficiency']
        )
        self.df['nutrient_deficiency'] = (self.df['nutrient_deficiency_count'] > 0).astype(int)

        # Ordinal stress score.
        stress_mapping = {'低': 1, '中': 2, '高': 3}
        self.df['stress_score'] = self.df['stress_level'].map(stress_mapping)

        # Age bins; include_lowest keeps age 0 inside the first bin.
        bins = [0, 20, 30, 40, 50, 100]
        labels = ['0-20岁', '21-30岁', '31-40岁', '41-50岁', '50+岁']
        self.df['age_group'] = pd.cut(
            self.df['age'], bins=bins, labels=labels, include_lowest=True
        )

        # Binary encodings of the yes/no style columns.
        yes_no = {'是': 1, '否': 0}
        self.df['gender_code'] = self.df['gender'].map({'男': 1, '女': 0})
        self.df['genetic_code'] = self.df['genetic_history'].map(yes_no)
        self.df['smoking_code'] = self.df['smoking'].map(yes_no)
        self.df['alcohol_code'] = self.df['alcohol'].map(yes_no)

        # Exercise frequency as an ordinal variable.
        exercise_mapping = {'很少': 1, '偶尔': 2, '经常': 3}
        self.df['exercise_code'] = self.df['exercise_frequency'].map(exercise_mapping)

        # Numeric hair-loss severity (0..3), used as the modelling target.
        severity_mapping = {level: code for code, level in enumerate(self.SEVERITY_LEVELS)}
        self.df['hair_loss_severity'] = self.df['hair_loss_degree'].map(severity_mapping)
        print("数据预处理完成")

    @staticmethod
    def _save_figure(filename):
        """Tighten the layout of the current figure, save it and close it."""
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    def explore_data(self):
        """Print summary statistics and save four exploratory charts."""
        print("\n数据概览:")
        # DataFrame.info() prints by itself and returns None, so it is not
        # wrapped in print() (that would emit a stray "None").
        self.df.info()
        print("\n数据基本统计:")
        print(self.df.describe())

        # Distribution of hair-loss degree.
        plt.figure(figsize=(10, 6))
        sns.countplot(x='hair_loss_degree', data=self.df, order=self.SEVERITY_LEVELS)
        plt.title('脱发程度分布')
        plt.xlabel('脱发程度')
        plt.ylabel('人数')
        self._save_figure('hair_loss_distribution.png')

        # Hair-loss degree per age group.
        plt.figure(figsize=(12, 6))
        age_hairloss = pd.crosstab(self.df['age_group'], self.df['hair_loss_degree'])
        age_hairloss.plot(kind='bar', ax=plt.gca())
        plt.title('各年龄段脱发程度分布')
        plt.xlabel('年龄段')
        plt.ylabel('人数')
        self._save_figure('hair_loss_by_age.png')

        # Hair-loss degree per gender.
        plt.figure(figsize=(10, 6))
        gender_hairloss = pd.crosstab(self.df['gender'], self.df['hair_loss_degree'])
        gender_hairloss.plot(kind='bar', ax=plt.gca())
        plt.title('性别与脱发程度关系')
        plt.xlabel('性别')
        plt.ylabel('人数')
        self._save_figure('hair_loss_by_gender.png')

        # Hair-loss degree split by nutrient deficiency. The hue is mapped to
        # its display label up front so legend entries cannot be mismatched.
        plt.figure(figsize=(10, 6))
        nutrition_view = pd.DataFrame({
            'hair_loss_degree': self.df['hair_loss_degree'],
            'nutrition': self.df['nutrient_deficiency'].map({0: '否', 1: '是'}),
        })
        sns.countplot(
            x='hair_loss_degree', hue='nutrition', data=nutrition_view,
            order=self.SEVERITY_LEVELS, hue_order=['否', '是'],
        )
        plt.title('营养缺乏与脱发程度关系')
        plt.xlabel('脱发程度')
        plt.ylabel('人数')
        plt.legend(title='营养缺乏')
        self._save_figure('hair_loss_by_nutrition.png')

        print("数据探索性分析完成,图表已保存")

    def _interaction_table(self):
        """Cross-tabulate genetic x nutrition status against hair-loss degree.

        Returns:
            DataFrame with one row per (genetic status, nutrition status)
            pair present in the data, one count column per severity level
            (always all four, zero-filled) and a '重度比例' column holding
            the share of severe cases in the row.
        """
        genetic_status = (
            self.df['genetic_code'].map({1: '有遗传史', 0: '无遗传史'}).rename('genetic_status')
        )
        nutrition_status = (
            self.df['nutrient_deficiency']
            .map({1: '营养缺乏', 0: '营养正常'})
            .rename('nutrition_status')
        )
        table = pd.crosstab([genetic_status, nutrition_status], self.df['hair_loss_degree'])
        # Guarantee every severity column exists, even if absent from the data.
        table = table.reindex(columns=self.SEVERITY_LEVELS, fill_value=0)
        table['重度比例'] = table['重度'] / table.sum(axis=1)
        return table

    def genetic_nutrition_analysis(self):
        """Analyse the combined effect of family history and nutrition.

        Saves two bar charts and prints group statistics.

        Returns:
            The cross table built by ``_interaction_table``.
        """
        # Split by family history (positive / negative).
        genetic_positive = self.df[self.df['genetic_code'] == 1]
        genetic_negative = self.df[self.df['genetic_code'] == 0]

        # Share of nutrient-deficient people and mean severity per group.
        pos_nutrient_deficiency = genetic_positive['nutrient_deficiency'].mean()
        neg_nutrient_deficiency = genetic_negative['nutrient_deficiency'].mean()
        pos_severity = genetic_positive['hair_loss_severity'].mean()
        neg_severity = genetic_negative['hair_loss_severity'].mean()

        group_names = ['遗传史阳性', '遗传史阴性']
        group_colors = ['salmon', 'lightblue']

        # Nutrient-deficiency ratio per group.
        plt.figure(figsize=(10, 6))
        plt.bar(group_names, [pos_nutrient_deficiency, neg_nutrient_deficiency],
                color=group_colors)
        plt.title('遗传史与营养缺乏比例关系')
        plt.xlabel('遗传史')
        plt.ylabel('营养缺乏比例')
        self._save_figure('genetic_nutrition_deficiency.png')

        # Mean severity per group, with the axis labelled by severity names.
        plt.figure(figsize=(10, 6))
        plt.bar(group_names, [pos_severity, neg_severity], color=group_colors)
        plt.title('遗传史与脱发严重程度关系')
        plt.xlabel('遗传史')
        plt.ylabel('平均脱发严重程度')
        plt.ylim(0, 3)
        plt.yticks([0, 1, 2, 3], self.SEVERITY_LEVELS)
        self._save_figure('genetic_hair_loss_severity.png')

        cross_table = self._interaction_table()

        print("\n遗传与营养对脱发的交互影响:")
        print(f"遗传史阳性组营养缺乏比例: {pos_nutrient_deficiency:.2%}")
        print(f"遗传史阴性组营养缺乏比例: {neg_nutrient_deficiency:.2%}")
        print(f"遗传史阳性组平均脱发严重程度: {pos_severity:.2f}")
        print(f"遗传史阴性组平均脱发严重程度: {neg_severity:.2f}")
        print("\n遗传与营养缺乏组合下的脱发程度分布:")
        print(cross_table)
        return cross_table

    def stress_analysis(self):
        """Analyse how stress level relates to hair loss and other factors.

        Saves a grouped bar chart and a correlation heat map.

        Returns:
            DataFrame indexed by stress level with the per-group means of
            sleep, nutrient deficiency, severity, family history and smoking.
        """
        # Hair-loss degree distribution per stress level.
        plt.figure(figsize=(12, 6))
        stress_hairloss = pd.crosstab(self.df['stress_level'], self.df['hair_loss_degree'])
        stress_hairloss.plot(kind='bar', ax=plt.gca())
        plt.title('不同压力水平下的脱发程度分布')
        plt.xlabel('压力水平')
        plt.ylabel('人数')
        self._save_figure('hair_loss_by_stress.png')

        # Correlation between stress and the other numeric factors.
        stress_corr = self.df[['stress_score', 'sleep_hours', 'exercise_code',
                               'vitamin_d', 'ferritin', 'zinc', 'hair_loss_severity']].corr()
        plt.figure(figsize=(10, 8))
        sns.heatmap(stress_corr, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('压力与其他因素的相关性热图')
        self._save_figure('stress_correlation_heatmap.png')

        # Per-stress-level means, renamed to their display labels.
        summary_columns = {
            'sleep_hours': '平均睡眠时长',
            'nutrient_deficiency': '营养缺乏比例',
            'hair_loss_severity': '平均脱发严重程度',
            'genetic_code': '遗传史阳性比例',
            'smoking_code': '吸烟者比例',
        }
        stress_summary_df = (
            self.df.groupby('stress_level')[list(summary_columns)]
            .mean()
            .rename(columns=summary_columns)
            .rename_axis(None)
        )
        print("\n不同压力水平下的其他因素对比:")
        print(stress_summary_df)
        return stress_summary_df

    def cluster_analysis(self, n_clusters=3):
        """Cluster the population with K-means on standardised features.

        Writes the cluster label of each row to ``self.df['cluster']`` and
        saves a scatter plot and a radar chart. File names are fixed, so a
        later call overwrites the charts (and labels) of an earlier one.

        Args:
            n_clusters: Number of K-means clusters.

        Returns:
            DataFrame of cluster centres in original feature units, plus a
            ``cluster_size`` column.
        """
        cluster_features = [
            'age', 'stress_score', 'sleep_hours', 'exercise_code',
            'genetic_code', 'smoking_code', 'alcohol_code',
            'vitamin_d', 'ferritin', 'zinc',
            'nutrient_deficiency_count', 'hair_loss_severity',
        ]

        # Standardise so no feature dominates the distance metric.
        X = self.df[cluster_features].values
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.df['cluster'] = kmeans.fit_predict(X_scaled)

        # Map the centres back to original units for readability.
        cluster_centers = pd.DataFrame(
            scaler.inverse_transform(kmeans.cluster_centers_),
            columns=cluster_features,
        )
        cluster_centers['cluster_size'] = self.df['cluster'].value_counts().sort_index()

        # Scatter plot on two of the features.
        plt.figure(figsize=(10, 8))
        sns.scatterplot(
            x='age',
            y='stress_score',
            hue='cluster',
            palette='viridis',
            s=100,
            data=self.df,
            alpha=0.7,
        )
        plt.title(f'基于K-means的脱发人群聚类分析 (k={n_clusters})')
        plt.xlabel('年龄')
        plt.ylabel('压力分数')
        self._save_figure('hair_loss_clusters.png')

        self._plot_cluster_radar(cluster_centers, cluster_features)

        print(f"\n聚类分析完成 (k={n_clusters})")
        print("各聚类特征中心点:")
        print(cluster_centers)
        return cluster_centers

    @staticmethod
    def _min_max_scale(frame):
        """Scale every column of *frame* to [0, 1] across its rows.

        A column whose values are all equal has no spread to scale by; it is
        set to 0.5 instead of dividing by zero.
        """
        scaled = frame.astype(float)
        for column in scaled.columns:
            low, high = scaled[column].min(), scaled[column].max()
            if high > low:
                scaled[column] = (scaled[column] - low) / (high - low)
            else:
                scaled[column] = 0.5
        return scaled

    def _plot_cluster_radar(self, cluster_centers, features):
        """Save a radar chart comparing the cluster centres.

        Args:
            cluster_centers: DataFrame of centres, one row per cluster.
            features: Names of the clustering features. Currently unused;
                kept so existing callers keep working.
        """
        radar_features = [
            'age', 'stress_score', 'sleep_hours',
            'genetic_code', 'nutrient_deficiency_count', 'hair_loss_severity',
        ]
        # Rescale per feature so axes with different units are comparable.
        radar_data = self._min_max_scale(cluster_centers[radar_features])

        angles = np.linspace(0, 2 * np.pi, len(radar_features), endpoint=False).tolist()
        angles += angles[:1]  # repeat the first angle to close the polygon

        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))
        for cluster in radar_data.index:
            values = radar_data.loc[cluster].tolist()
            values += values[:1]  # close the polygon
            ax.plot(angles, values, linewidth=2, label=f'聚类 {cluster}')
            ax.fill(angles, values, alpha=0.1)

        ax.set_thetagrids(np.degrees(angles[:-1]), radar_features)
        ax.set_ylim(0, 1.1)
        plt.title('各聚类特征比较雷达图')
        plt.legend(loc='upper right')
        self._save_figure('cluster_radar_chart.png')

    def build_prediction_model(self):
        """Train and evaluate a random-forest classifier of hair-loss severity.

        Uses a 70/30 train/test split, prints a classification report and
        saves a confusion-matrix heat map and a feature-importance chart.

        Returns:
            Tuple ``(model, feature_importance)``: the fitted classifier and
            a Series of importances sorted in descending order.
        """
        features = [
            'age', 'gender_code', 'genetic_code', 'stress_score',
            'sleep_hours', 'exercise_code', 'smoking_code', 'alcohol_code',
            'vitamin_d', 'ferritin', 'zinc', 'nutrient_deficiency_count',
        ]
        X = self.df[features]
        y = self.df['hair_loss_severity']

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("\n模型评估报告:")
        print(classification_report(y_test, y_pred))

        # Pin the label set so the matrix is always 4x4 and lines up with the
        # tick labels even when a class is missing from the test split.
        severity_codes = list(range(len(self.SEVERITY_LEVELS)))
        cm = confusion_matrix(y_test, y_pred, labels=severity_codes)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=self.SEVERITY_LEVELS,
                    yticklabels=self.SEVERITY_LEVELS)
        plt.title('脱发预测模型混淆矩阵')
        plt.xlabel('预测类别')
        plt.ylabel('实际类别')
        self._save_figure('confusion_matrix.png')

        feature_importance = pd.Series(
            model.feature_importances_, index=features
        ).sort_values(ascending=False)
        plt.figure(figsize=(12, 8))
        sns.barplot(x=feature_importance, y=feature_importance.index)
        plt.title('特征重要性分析')
        plt.xlabel('重要性')
        self._save_figure('feature_importance.png')

        print("\n特征重要性排序:")
        print(feature_importance)
        return model, feature_importance

    def save_analysis_results(self, output_path='hair_loss_analysis_results.csv'):
        """Write ``self.df`` (raw plus derived columns) to a CSV file.

        Args:
            output_path: Destination file path.
        """
        self.df.to_csv(output_path, index=False)
        print(f"\n分析结果已保存到 {output_path}")
if __name__ == "__main__":
    # Build the analyzer; with no path given it runs on generated sample data.
    analyzer = HairLossAnalyzer()
    # Exploratory analysis.
    analyzer.explore_data()
    # Combined effect of family history and nutrition.
    genetic_nutrition_results = analyzer.genetic_nutrition_analysis()
    # Stress factor analysis.
    stress_results = analyzer.stress_analysis()
    # Clustering with two different cluster counts.
    cluster_results_k3 = analyzer.cluster_analysis(n_clusters=3)
    cluster_results_k4 = analyzer.cluster_analysis(n_clusters=4)
    # Prediction model.
    model, feature_importance = analyzer.build_prediction_model()
    # Persist the enriched data set.
    analyzer.save_analysis_results()
    print("\n脱发因素分析与预测完成!")
    print("生成的图表包括:")
    print("- 脱发程度分布 (hair_loss_distribution.png)")
    print("- 各年龄段脱发情况 (hair_loss_by_age.png)")
    print("- 性别与脱发程度关系 (hair_loss_by_gender.png)")
    print("- 营养缺乏与脱发程度 (hair_loss_by_nutrition.png)")
    print("- 遗传史与营养缺乏比例关系 (genetic_nutrition_deficiency.png)")
    print("- 遗传史与脱发严重程度关系 (genetic_hair_loss_severity.png)")
    print("- 不同压力水平下的脱发程度分布 (hair_loss_by_stress.png)")
    print("- 压力与其他因素的相关性热图 (stress_correlation_heatmap.png)")
    print("- 基于K-means的脱发人群聚类分析 (hair_loss_clusters.png)")
    print("- 各聚类特征比较雷达图 (cluster_radar_chart.png)")
    print("- 脱发预测模型混淆矩阵 (confusion_matrix.png)")
    print("- 特征重要性分析 (feature_importance.png)")
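# (Web-page residue that followed the script -- a view counter and a
# "collapsed comments" notice -- was not part of the program.)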



