Notes — Data Normalization: the Scaler in scikit-learn

This post covers why data normalization matters and the main approaches, including min-max normalization and mean-variance normalization (standardization). It shows how to preprocess data with numpy and scikit-learn, and uses a KNN classifier to verify the effect of normalization on model performance.

Data Normalization

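For reference, the two standard definitions:

Min-max normalization: x_scale = (x - x_min) / (x_max - x_min), which maps every value into [0, 1].
Mean-variance normalization (standardization): x_scale = (x - x_mean) / s, where x_mean is the feature mean and s its standard deviation; the result has mean 0 and standard deviation 1.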

Except for datasets whose features have clear, fixed boundaries (e.g. pixel values), mean-variance normalization is generally the better choice.
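As a minimal sketch in plain numpy (the array X2 below is illustrative, not part of the notebook that follows), mean-variance normalization of each column looks like this:

import numpy as np

X2 = np.random.randint(0, 100, (50, 2))
X2 = np.array(X2, dtype=float)
X2[:,0] = (X2[:,0] - np.mean(X2[:,0])) / np.std(X2[:,0])  # column mean becomes 0
X2[:,1] = (X2[:,1] - np.mean(X2[:,1])) / np.std(X2[:,1])  # column std becomes 1
np.mean(X2[:,0]), np.std(X2[:,0])  # approximately (0.0, 1.0)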


The test set must be normalized with the mean and standard deviation computed from the training set.
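This is exactly what scikit-learn's StandardScaler does: fit on the training set, then transform both sets with the stored training statistics. A minimal sketch, where X_train and X_test stand in for a real train/test split:

import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = np.random.randint(0, 100, (80, 2)).astype(float)  # stand-in for a real training set
X_test = np.random.randint(0, 100, (20, 2)).astype(float)   # stand-in for a real test set

scaler = StandardScaler()
scaler.fit(X_train)                    # stores the training mean_ and scale_
X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)  # reuses the training mean and std; never refit on the test set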


import numpy as np
import matplotlib.pyplot as plt

Min-max normalization

x=np.random.randint(0,100,100)
x
array([84,  5,  7, 97, 16, 15, 64, 71, 55, 58, 12,  0, 73, 41, 27, 92, 97,
       21, 29, 69, 46,  7, 70, 68, 61, 59, 65,  2, 70, 30, 34, 45, 86, 29,
       17, 21, 41, 50,  5, 51,  3, 27, 68, 25, 53, 76, 15,  9, 16, 63, 62,
       65, 39, 78, 76, 82, 83, 67, 51,  6, 32, 30, 99, 56, 65, 80, 31, 12,
        4, 33, 54, 95, 63, 87, 62, 55, 86, 27, 84, 96, 35, 54, 64, 88,  8,
       36, 99, 27, 50, 53, 95, 56, 20, 70, 15, 70, 27, 40,  4, 54])
(x-np.min(x))/(np.max(x)-np.min(x))
array([0.84848485, 0.05050505, 0.07070707, 0.97979798, 0.16161616,
       0.15151515, 0.64646465, 0.71717172, 0.55555556, 0.58585859,
       0.12121212, 0.        , 0.73737374, 0.41414141, 0.27272727,
       0.92929293, 0.97979798, 0.21212121, 0.29292929, 0.6969697 ,
       0.46464646, 0.07070707, 0.70707071, 0.68686869, 0.61616162,
       0.5959596 , 0.65656566, 0.02020202, 0.70707071, 0.3030303 ,
       0.34343434, 0.45454545, 0.86868687, 0.29292929, 0.17171717,
       0.21212121, 0.41414141, 0.50505051, 0.05050505, 0.51515152,
       0.03030303, 0.27272727, 0.68686869, 0.25252525, 0.53535354,
       0.76767677, 0.15151515, 0.09090909, 0.16161616, 0.63636364,
       0.62626263, 0.65656566, 0.39393939, 0.78787879, 0.76767677,
       0.82828283, 0.83838384, 0.67676768, 0.51515152, 0.06060606,
       0.32323232, 0.3030303 , 1.        , 0.56565657, 0.65656566,
       0.80808081, 0.31313131, 0.12121212, 0.04040404, 0.33333333,
       0.54545455, 0.95959596, 0.63636364, 0.87878788, 0.62626263,
       0.55555556, 0.86868687, 0.27272727, 0.84848485, 0.96969697,
       0.35353535, 0.54545455, 0.64646465, 0.88888889, 0.08080808,
       0.36363636, 1.        , 0.27272727, 0.50505051, 0.53535354,
       0.95959596, 0.56565657, 0.2020202 , 0.70707071, 0.15151515,
       0.70707071, 0.27272727, 0.4040404 , 0.04040404, 0.54545455])
X=np.random.randint(0,100,(50,2))
X[:10,:]
array([[55, 33],
       [51, 53],
       [40, 14],
       [10, 24],
       [90, 36],
       [76, 34],
       [45, 48],
       [86, 89],
       [88, 68],
       [ 4, 39]])
X=np.array(X,dtype=float)
X[:10,:]
array([[55., 33.],
       [51., 53.],
       [40., 14.],
       [10., 24.],
       [90., 36.],
       [76., 34.],
       [45., 48.],
       [86., 89.],
       [88., 68.],
       [ 4., 39.]])
X[:,0]=(X[:,0]-np.min(X[:,0]))/(np.max(X[:,0])-np.min(X[:,0]))
X[:,1]=(X[:,1]-np.min(X[:,1]))/(np.max(X[:,1])-np.min(X[:,1]))
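A quick sanity check (values now fall in [0, 1]; for uniformly distributed data the mean lands near 0.5) and a scatter plot of the scaled features:

np.min(X[:,0]), np.max(X[:,0])    # (0.0, 1.0) after min-max scaling
np.mean(X[:,0]), np.mean(X[:,1])  # data-dependent, roughly 0.5 here

plt.scatter(X[:,0], X[:,1])
plt.show()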