【无标题】

最新推荐文章于 2025-12-05 17:02:52 发布

原创最新推荐文章于 2025-12-05 17:02:52 发布 · 249 阅读

CC 4.0 BY-SA版权

文章标签：

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Silence every warning category for the whole process so library warnings
# do not clutter the console output of the analysis run.
warnings.filterwarnings('ignore')

# Configure fonts that can render the Chinese titles and labels used in the
# charts; the entries are alternative font names for matplotlib to choose from.
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus
# sign (presumably because the CJK fonts above may lack that glyph).
plt.rcParams['axes.unicode_minus'] = False

class HairLossAnalyzer:
    """Explore, cluster and model hair-loss risk factors held in a DataFrame.

    The working data lives in ``self.df``. It is either read from a CSV file
    or generated synthetically, and is then enriched with derived columns by
    :meth:`prepare_data`. The analysis methods write their charts as PNG files
    into the current working directory and print summaries to stdout.
    """

    # Hair-loss degrees from least to most severe; a label's position in this
    # list is its numeric severity code.
    SEVERITY_LABELS = ['无', '轻度', '中度', '重度']

    # Stress levels from lowest to highest.
    STRESS_LABELS = ['低', '中', '高']

    # Raw columns that prepare_data() reads; a loaded CSV must provide them.
    REQUIRED_COLUMNS = (
        'age', 'gender', 'genetic_history', 'stress_level',
        'exercise_frequency', 'smoking', 'alcohol',
        'vitamin_d', 'ferritin', 'zinc', 'hair_loss_degree',
    )

    def __init__(self, data_path=None):
        """Load or generate the dataset, then run the preprocessing step.

        Args:
            data_path: Optional path of a CSV file. When it is falsy, or when
                loading fails, synthetic sample data is used instead.
        """
        self.df = None
        if data_path:
            self.load_data(data_path)
        else:
            self.generate_sample_data()
        self.prepare_data()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _order_labels(present, preferred):
        """Return ``preferred`` followed by any other labels in ``present``."""
        extras = [label for label in present if label not in preferred]
        return list(preferred) + extras

    @staticmethod
    def _save_figure(filename):
        """Tighten the layout of the current figure, save it and close it."""
        plt.tight_layout()
        plt.savefig(filename)
        plt.close()

    def _degree_crosstab(self, row_keys, row_order=None):
        """Cross-tabulate ``row_keys`` against the hair-loss degree.

        Columns are put in severity order and every known degree is present
        (zero-filled when unobserved), so callers can rely on each degree
        column existing. Unexpected degree labels are kept after the known
        ones.

        Args:
            row_keys: A Series, or list of Series, passed to ``pd.crosstab``
                as the row grouping.
            row_order: Optional preferred order of the row labels; labels
                missing from the data are added as all-zero rows.

        Returns:
            The count table as a DataFrame.
        """
        table = pd.crosstab(row_keys, self.df['hair_loss_degree'])
        table = table.reindex(
            columns=self._order_labels(table.columns, self.SEVERITY_LABELS),
            fill_value=0,
        )
        if row_order is not None:
            table = table.reindex(
                self._order_labels(table.index, row_order), fill_value=0
            )
        return table

    # ------------------------------------------------------------------
    # Data acquisition and preparation
    # ------------------------------------------------------------------
    def generate_sample_data(self):
        """Populate ``self.df`` with 500 synthetic records for demonstration.

        The global NumPy random generator is seeded with 42, so repeated
        calls produce the same data.
        """
        np.random.seed(42)
        n_samples = 500

        yes_no = ['是', '否']
        # NOTE: the order of the np.random calls below determines the
        # generated values; keep it stable to keep the sample reproducible.
        data = {
            'age': np.random.normal(35, 10, n_samples).astype(int),
            'gender': np.random.choice(['男', '女'], n_samples),
            'genetic_history': np.random.choice(yes_no, n_samples, p=[0.3, 0.7]),
            'stress_level': np.random.choice(['高', '中', '低'], n_samples, p=[0.4, 0.4, 0.2]),
            'sleep_hours': np.random.normal(6.5, 1.5, n_samples).clip(3, 10),
            'exercise_frequency': np.random.choice(['很少', '偶尔', '经常'], n_samples),
            'smoking': np.random.choice(yes_no, n_samples),
            'alcohol': np.random.choice(yes_no, n_samples),
            'vitamin_d': np.random.normal(50, 20, n_samples),  # vitamin D level
            'ferritin': np.random.normal(80, 40, n_samples),   # ferritin level
            'zinc': np.random.normal(10, 2, n_samples),        # zinc level
            'hair_loss_degree': np.random.choice(self.SEVERITY_LABELS, n_samples),
        }

        # Inject correlations: older people with a family history are pushed
        # towards moderate/severe loss, and highly stressed short sleepers
        # towards mild/moderate loss.
        degrees = data['hair_loss_degree']
        for i in range(n_samples):
            if data['age'][i] > 45 and data['genetic_history'][i] == '是':
                degrees[i] = np.random.choice(['中度', '重度'], p=[0.6, 0.4])
            elif data['stress_level'][i] == '高' and data['sleep_hours'][i] < 5:
                degrees[i] = np.random.choice(['轻度', '中度'], p=[0.4, 0.6])

        self.df = pd.DataFrame(data)

    def load_data(self, data_path):
        """Read the dataset from a CSV file, falling back to sample data.

        Any failure (unreadable file, parse error, or a file lacking one of
        ``REQUIRED_COLUMNS``) is reported on stdout and the synthetic sample
        data is generated instead, so the analyzer always ends up with a
        usable DataFrame.

        Args:
            data_path: Path of the CSV file to read.
        """
        try:
            frame = pd.read_csv(data_path)
            missing = [col for col in self.REQUIRED_COLUMNS if col not in frame.columns]
            if missing:
                # Fail here with a clear message rather than with a KeyError
                # later inside prepare_data().
                raise ValueError(f"缺少必需的列: {', '.join(missing)}")
            self.df = frame
            print(f"成功加载数据，共{len(self.df)}条记录")
        except Exception as e:  # deliberate best-effort: any failure falls back
            print(f"加载数据失败: {e}")
            print("将使用示例数据进行分析")
            self.generate_sample_data()

    def prepare_data(self):
        """Add the derived and encoded feature columns to ``self.df``.

        Category values that are not covered by a mapping become NaN in the
        corresponding encoded column.
        """
        df = self.df

        # Nutrient deficiency flags, their count, and an "any deficiency" flag.
        df['vitamin_d_deficiency'] = (df['vitamin_d'] < 30).astype(int)
        df['ferritin_deficiency'] = (df['ferritin'] < 50).astype(int)
        df['zinc_deficiency'] = (df['zinc'] < 8).astype(int)
        df['nutrient_deficiency_count'] = (
            df['vitamin_d_deficiency']
            + df['ferritin_deficiency']
            + df['zinc_deficiency']
        )
        df['nutrient_deficiency'] = (df['nutrient_deficiency_count'] > 0).astype(int)

        # Ordinal stress score: 1 (low) to 3 (high).
        stress_mapping = {label: score for score, label in enumerate(self.STRESS_LABELS, start=1)}
        df['stress_score'] = df['stress_level'].map(stress_mapping)

        # Age bands; intervals are right-inclusive, so ages <= 0 or > 100
        # fall outside every bin and become NaN.
        bins = [0, 20, 30, 40, 50, 100]
        labels = ['0-20岁', '21-30岁', '31-40岁', '41-50岁', '50+岁']
        df['age_group'] = pd.cut(df['age'], bins=bins, labels=labels)

        # Binary encodings of the categorical columns.
        yes_no_mapping = {'是': 1, '否': 0}
        df['gender_code'] = df['gender'].map({'男': 1, '女': 0})
        df['genetic_code'] = df['genetic_history'].map(yes_no_mapping)
        df['smoking_code'] = df['smoking'].map(yes_no_mapping)
        df['alcohol_code'] = df['alcohol'].map(yes_no_mapping)

        # Exercise frequency as an ordinal variable.
        exercise_mapping = {'很少': 1, '偶尔': 2, '经常': 3}
        df['exercise_code'] = df['exercise_frequency'].map(exercise_mapping)

        # Numeric severity code (0-3) used for modelling.
        severity_mapping = {label: code for code, label in enumerate(self.SEVERITY_LABELS)}
        df['hair_loss_severity'] = df['hair_loss_degree'].map(severity_mapping)

        print("数据预处理完成")

    # ------------------------------------------------------------------
    # Analyses
    # ------------------------------------------------------------------
    def explore_data(self):
        """Print summary statistics and save four exploratory charts.

        Files written: ``hair_loss_distribution.png``,
        ``hair_loss_by_age.png``, ``hair_loss_by_gender.png`` and
        ``hair_loss_by_nutrition.png``.
        """
        print("\n数据概览:")
        # DataFrame.info() prints by itself and returns None, so it must not
        # be wrapped in print().
        self.df.info()

        print("\n数据基本统计:")
        print(self.df.describe())

        # Overall distribution of hair-loss degree.
        plt.figure(figsize=(10, 6))
        sns.countplot(x='hair_loss_degree', data=self.df, order=self.SEVERITY_LABELS)
        plt.title('脱发程度分布')
        plt.xlabel('脱发程度')
        plt.ylabel('人数')
        self._save_figure('hair_loss_distribution.png')

        # Hair-loss degree per age band.
        plt.figure(figsize=(12, 6))
        self._degree_crosstab(self.df['age_group']).plot(kind='bar', ax=plt.gca())
        plt.title('各年龄段脱发程度分布')
        plt.xlabel('年龄段')
        plt.ylabel('人数')
        self._save_figure('hair_loss_by_age.png')

        # Hair-loss degree per gender.
        plt.figure(figsize=(10, 6))
        self._degree_crosstab(self.df['gender']).plot(kind='bar', ax=plt.gca())
        plt.title('性别与脱发程度关系')
        plt.xlabel('性别')
        plt.ylabel('人数')
        self._save_figure('hair_loss_by_gender.png')

        # Hair-loss degree split by nutrient deficiency. hue_order is pinned
        # so the relabelled legend entries ('否' for 0, '是' for 1) always
        # match the plotted groups.
        plt.figure(figsize=(10, 6))
        sns.countplot(
            x='hair_loss_degree',
            hue='nutrient_deficiency',
            data=self.df,
            order=self.SEVERITY_LABELS,
            hue_order=[0, 1],
        )
        plt.title('营养缺乏与脱发程度关系')
        plt.xlabel('脱发程度')
        plt.ylabel('人数')
        plt.legend(title='营养缺乏', labels=['否', '是'])
        self._save_figure('hair_loss_by_nutrition.png')

        print("数据探索性分析完成，图表已保存")

    def genetic_nutrition_analysis(self):
        """Compare groups with and without a genetic history of hair loss.

        Saves ``genetic_nutrition_deficiency.png`` and
        ``genetic_hair_loss_severity.png`` and prints the group statistics.

        Returns:
            A DataFrame of counts indexed by (genetic status, nutrition
            status) with one column per hair-loss degree plus a ``重度比例``
            column holding the share of severe cases in each row.
        """
        has_history = self.df[self.df['genetic_code'] == 1]
        no_history = self.df[self.df['genetic_code'] == 0]

        # Share of people with at least one nutrient deficiency per group.
        pos_nutrient_deficiency = has_history['nutrient_deficiency'].mean()
        neg_nutrient_deficiency = no_history['nutrient_deficiency'].mean()

        # Mean numeric severity per group.
        pos_severity = has_history['hair_loss_severity'].mean()
        neg_severity = no_history['hair_loss_severity'].mean()

        group_names = ['遗传史阳性', '遗传史阴性']
        group_colors = ['salmon', 'lightblue']

        plt.figure(figsize=(10, 6))
        plt.bar(group_names,
                [pos_nutrient_deficiency, neg_nutrient_deficiency],
                color=group_colors)
        plt.title('遗传史与营养缺乏比例关系')
        plt.xlabel('遗传史')
        plt.ylabel('营养缺乏比例')
        self._save_figure('genetic_nutrition_deficiency.png')

        plt.figure(figsize=(10, 6))
        plt.bar(group_names, [pos_severity, neg_severity], color=group_colors)
        plt.title('遗传史与脱发严重程度关系')
        plt.xlabel('遗传史')
        plt.ylabel('平均脱发严重程度')
        # The y axis spans the severity codes and is labelled with their names.
        severity_codes = list(range(len(self.SEVERITY_LABELS)))
        plt.ylim(0, severity_codes[-1])
        plt.yticks(severity_codes, self.SEVERITY_LABELS)
        self._save_figure('genetic_hair_loss_severity.png')

        # Counts per (genetic status, nutrition status) x hair-loss degree.
        genetic_status = (
            self.df['genetic_code']
            .map({1: '有遗传史', 0: '无遗传史'})
            .rename('genetic_status')
        )
        nutrition_status = (
            self.df['nutrient_deficiency']
            .map({1: '营养缺乏', 0: '营养正常'})
            .rename('nutrition_status')
        )
        cross_table = self._degree_crosstab([genetic_status, nutrition_status])

        # Share of severe cases per combination. The row totals are taken
        # before the ratio column is added, so they contain counts only.
        row_totals = cross_table.sum(axis=1)
        cross_table['重度比例'] = cross_table['重度'] / row_totals

        print("\n遗传与营养对脱发的交互影响:")
        print(f"遗传史阳性组营养缺乏比例: {pos_nutrient_deficiency:.2%}")
        print(f"遗传史阴性组营养缺乏比例: {neg_nutrient_deficiency:.2%}")
        print(f"遗传史阳性组平均脱发严重程度: {pos_severity:.2f}")
        print(f"遗传史阴性组平均脱发严重程度: {neg_severity:.2f}")
        print("\n遗传与营养缺乏组合下的脱发程度分布:")
        print(cross_table)

        return cross_table

    def stress_analysis(self):
        """Analyse how stress level relates to hair loss and other factors.

        Saves ``hair_loss_by_stress.png`` and
        ``stress_correlation_heatmap.png``.

        Returns:
            A DataFrame with one row per stress level and the group means of
            sleep hours, nutrient deficiency, severity, genetic history and
            smoking as columns.
        """
        # Hair-loss degree distribution per stress level (low -> high).
        plt.figure(figsize=(12, 6))
        stress_hairloss = self._degree_crosstab(
            self.df['stress_level'], row_order=self.STRESS_LABELS
        )
        stress_hairloss.plot(kind='bar', ax=plt.gca())
        plt.title('不同压力水平下的脱发程度分布')
        plt.xlabel('压力水平')
        plt.ylabel('人数')
        self._save_figure('hair_loss_by_stress.png')

        # Pairwise correlations between stress and the other numeric factors.
        corr_columns = ['stress_score', 'sleep_hours', 'exercise_code',
                        'vitamin_d', 'ferritin', 'zinc', 'hair_loss_severity']
        stress_corr = self.df[corr_columns].corr()

        plt.figure(figsize=(10, 8))
        sns.heatmap(stress_corr, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title('压力与其他因素的相关性热图')
        self._save_figure('stress_correlation_heatmap.png')

        # Group means of the other factors for every stress level.
        stress_summary = {
            level: {
                '平均睡眠时长': group['sleep_hours'].mean(),
                '营养缺乏比例': group['nutrient_deficiency'].mean(),
                '平均脱发严重程度': group['hair_loss_severity'].mean(),
                '遗传史阳性比例': group['genetic_code'].mean(),
                '吸烟者比例': group['smoking_code'].mean(),
            }
            for level, group in self.df.groupby('stress_level')
        }

        stress_summary_df = pd.DataFrame(stress_summary).T
        print("\n不同压力水平下的其他因素对比:")
        print(stress_summary_df)

        return stress_summary_df

    def cluster_analysis(self, n_clusters=3):
        """Segment the records with K-means on standardised features.

        Each call overwrites the ``cluster`` column of ``self.df`` and the
        files ``hair_loss_clusters.png`` and ``cluster_radar_chart.png``.

        Args:
            n_clusters: Number of clusters to fit.

        Returns:
            A DataFrame with one row per cluster holding the cluster centre
            in original feature units plus a ``cluster_size`` column.
        """
        cluster_features = [
            'age', 'stress_score', 'sleep_hours', 'exercise_code',
            'genetic_code', 'smoking_code', 'alcohol_code',
            'vitamin_d', 'ferritin', 'zinc',
            'nutrient_deficiency_count', 'hair_loss_severity'
        ]

        # Standardise so every feature contributes on a comparable scale.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(self.df[cluster_features].values)

        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        self.df['cluster'] = kmeans.fit_predict(X_scaled)

        # Express the centres in the original (unscaled) feature units.
        cluster_centers = pd.DataFrame(
            scaler.inverse_transform(kmeans.cluster_centers_),
            columns=cluster_features
        )
        # Aligned on the cluster id, which is also the row index of the centres.
        cluster_centers['cluster_size'] = self.df['cluster'].value_counts().sort_index()

        # Scatter plot of the clusters over two of the features.
        plt.figure(figsize=(10, 8))
        sns.scatterplot(
            x='age',
            y='stress_score',
            hue='cluster',
            palette='viridis',
            s=100,
            data=self.df,
            alpha=0.7
        )
        plt.title(f'基于K-means的脱发人群聚类分析 (k={n_clusters})')
        plt.xlabel('年龄')
        plt.ylabel('压力分数')
        self._save_figure('hair_loss_clusters.png')

        self._plot_cluster_radar(cluster_centers, cluster_features)

        print(f"\n聚类分析完成 (k={n_clusters})")
        print("各聚类特征中心点:")
        print(cluster_centers)

        return cluster_centers

    def _plot_cluster_radar(self, cluster_centers, features):
        """Save a radar chart comparing the cluster centres.

        Args:
            cluster_centers: DataFrame of cluster centres, one row per cluster.
            features: Names of all clustering features. Currently unused; the
                chart shows a fixed subset of them.
        """
        radar_features = [
            'age', 'stress_score', 'sleep_hours',
            'genetic_code', 'nutrient_deficiency_count', 'hair_loss_severity'
        ]

        radar_data = cluster_centers[radar_features].copy()

        # Min-max scale each feature across clusters so the axes are
        # comparable. When all centres share one value the range is zero;
        # use a neutral 0.5 instead of dividing by zero.
        for feature in radar_features:
            column = radar_data[feature]
            low = column.min()
            span = column.max() - low
            if span > 0:
                radar_data[feature] = (column - low) / span
            else:
                radar_data[feature] = 0.5

        # One angle per feature, repeating the first to close the polygon.
        angles = np.linspace(0, 2 * np.pi, len(radar_features), endpoint=False).tolist()
        angles += angles[:1]

        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True))

        for cluster in radar_data.index:
            values = radar_data.loc[cluster].tolist()
            values += values[:1]  # close the polygon
            ax.plot(angles, values, linewidth=2, label=f'聚类 {cluster}')
            ax.fill(angles, values, alpha=0.1)

        ax.set_thetagrids(np.degrees(angles[:-1]), radar_features)
        ax.set_ylim(0, 1.1)
        plt.title('各聚类特征比较雷达图')
        plt.legend(loc='upper right')
        self._save_figure('cluster_radar_chart.png')

    def build_prediction_model(self):
        """Train and evaluate a random-forest classifier of severity.

        Saves ``confusion_matrix.png`` and ``feature_importance.png`` and
        prints the classification report and the importance ranking.

        Returns:
            A ``(model, feature_importance)`` tuple: the fitted
            ``RandomForestClassifier`` and a Series of feature importances
            sorted in descending order.
        """
        features = [
            'age', 'gender_code', 'genetic_code', 'stress_score',
            'sleep_hours', 'exercise_code', 'smoking_code', 'alcohol_code',
            'vitamin_d', 'ferritin', 'zinc', 'nutrient_deficiency_count'
        ]

        X = self.df[features]
        y = self.df['hair_loss_severity']

        # 70/30 train/test split with a fixed seed for reproducibility.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42
        )

        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        print("\n模型评估报告:")
        print(classification_report(y_test, y_pred))

        # Pin the label set so the matrix always has one row/column per
        # severity code and therefore matches the tick labels, even when a
        # class is absent from the test split.
        severity_codes = list(range(len(self.SEVERITY_LABELS)))
        cm = confusion_matrix(y_test, y_pred, labels=severity_codes)
        plt.figure(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=self.SEVERITY_LABELS,
                    yticklabels=self.SEVERITY_LABELS)
        plt.title('脱发预测模型混淆矩阵')
        plt.xlabel('预测类别')
        plt.ylabel('实际类别')
        self._save_figure('confusion_matrix.png')

        feature_importance = pd.Series(
            model.feature_importances_, index=features
        ).sort_values(ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(x=feature_importance, y=feature_importance.index)
        plt.title('特征重要性分析')
        plt.xlabel('重要性')
        self._save_figure('feature_importance.png')

        print("\n特征重要性排序:")
        print(feature_importance)

        return model, feature_importance

    def save_analysis_results(self, output_path='hair_loss_analysis_results.csv'):
        """Write ``self.df``, including all derived columns, to a CSV file.

        Args:
            output_path: Destination path of the CSV file.
        """
        self.df.to_csv(output_path, index=False)
        print(f"\n分析结果已保存到 {output_path}")

def main():
    """Run every analysis step on the built-in sample data."""
    analyzer = HairLossAnalyzer()

    # Exploratory analysis.
    analyzer.explore_data()

    # Effect of genetics and nutrition on hair loss.
    analyzer.genetic_nutrition_analysis()

    # Stress-factor analysis.
    analyzer.stress_analysis()

    # Clustering with two different cluster counts. The second call
    # overwrites the chart files and the 'cluster' column of the first.
    analyzer.cluster_analysis(n_clusters=3)
    analyzer.cluster_analysis(n_clusters=4)

    # Prediction model.
    analyzer.build_prediction_model()

    # Persist the enriched dataset.
    analyzer.save_analysis_results()

    print("\n脱发因素分析与预测完成！")
    print("生成的图表包括:")
    generated_charts = [
        ("脱发程度分布", "hair_loss_distribution.png"),
        ("各年龄段脱发情况", "hair_loss_by_age.png"),
        ("性别与脱发程度关系", "hair_loss_by_gender.png"),
        ("营养缺乏与脱发程度", "hair_loss_by_nutrition.png"),
        ("遗传史与营养缺乏比例关系", "genetic_nutrition_deficiency.png"),
        ("遗传史与脱发严重程度关系", "genetic_hair_loss_severity.png"),
        ("不同压力水平下的脱发程度分布", "hair_loss_by_stress.png"),
        ("压力与其他因素的相关性热图", "stress_correlation_heatmap.png"),
        ("基于K-means的脱发人群聚类分析", "hair_loss_clusters.png"),
        ("各聚类特征比较雷达图", "cluster_radar_chart.png"),
        ("脱发预测模型混淆矩阵", "confusion_matrix.png"),
        ("特征重要性分析", "feature_importance.png"),
    ]
    for title, filename in generated_charts:
        print(f"- {title} ({filename})")


if __name__ == "__main__":
    main()