```python
import pandas as pd
import numpy as np
from typing import List, Union, Dict
import re
def clean_and_validate_data(df: pd.DataFrame,
numeric_columns: List[str] = None,
text_columns: List[str] = None,
date_columns: List[str] = None) -> pd.DataFrame:
数据清洗与验证函数
参数:
df: 输入的数据框
numeric_columns: 数值型列名列表
text_columns: 文本型列名列表
date_columns: 日期型列名列表
返回:
清洗后的数据框
df_clean = df.copy()
# 处理数值型数据
if numeric_columns:
for col in numeric_columns:
if col in df_clean.columns:
# 替换异常值为NaN
df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')
# 处理极端值 (使用IQR方法)
Q1 = df_clean[col].quantile(0.25)
Q3 = df_clean[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 IQR
upper_bound = Q3 + 1.5 IQR
df_clean[col] = np.where(
(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound),
np.nan, df_clean[col]
)
# 处理文本型数据
if text_columns:
for col in text_columns:
if col in df_clean.columns:
# 去除前后空格
df_clean[col] = df_clean[col].astype(str).str.strip()
# 处理空字符串
df_clean[col] = df_clean[col].replace(['', 'nan', 'None'], np.nan)
# 处理日期型数据
if date_columns:
for col in date_columns:
if col in df_clean.columns:
df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')
return df_clean
def handle_missing_values(df: pd.DataFrame,
strategy: str = 'mean',
categorical_strategy: str = 'mode') -> pd.DataFrame:
缺失值处理函数
参数:
df: 输入数据框
strategy: 数值型数据填充策略 ('mean', 'median', 'mode', 'ffill', 'bfill')
categorical_strategy: 分类型数据填充策略 ('mode', 'unknown')
返回:
处理缺失值后的数据框
df_filled = df.copy()
for column in df_filled.columns:
if df_filled[column].isnull().sum() > 0:
# 数值型数据
if pd.api.types.is_numeric_dtype(df_filled[column]):
if strategy == 'mean':
fill_value = df_filled[column].mean()
elif strategy == 'median':
fill_value = df_filled[column].median()
elif strategy == 'mode':
fill_value = df_filled[column].mode()[0] if not df_filled[column].mode().empty else 0
elif strategy == 'ffill':
df_filled[column] = df_filled[column].fillna(method='ffill')
continue
elif strategy == 'bfill':
df_filled[column] = df_filled[column].fillna(method='bfill')
continue
df_filled[column] = df_filled[column].fillna(fill_value)
# 分类型数据
else:
if categorical_strategy == 'mode':
fill_value = df_filled[column].mode()[0] if not df_filled[column].mode().empty else 'Unknown'
else:
fill_value = 'Unknown'
df_filled[column] = df_filled[column].fillna(fill_value)
return df_filled
def feature_engineering(df: pd.DataFrame,
numeric_features: List[str] = None,
categorical_features: List[str] = None,
date_features: List[str] = None) -> pd.DataFrame:
特征工程函数
参数:
df: 输入数据框
numeric_features: 数值型特征列
categorical_features: 分类型特征列
date_features: 日期型特征列
返回:
包含新特征的数据框
df_engineered = df.copy()
# 数值型特征工程
if numeric_features:
for col in numeric_features:
if col in df_engineered.columns:
# 创建多项式特征
df_engineered[f'{col}_squared'] = df_engineered[col] 2
df_engineered[f'{col}_log'] = np.log1p(np.abs(df_engineered[col]))
# 创建分箱特征
df_engineered[f'{col}_binned'] = pd.cut(df_engineered[col], bins=5, labels=False)
# 分类型特征工程
if categorical_features:
for col in categorical_features:
if col in df_engineered.columns:
# 频率编码
freq_encoding = df_engineered[col].value_counts().to_dict()
df_engineered[f'{col}_freq'] = df_engineered[col].map(freq_encoding)
# 目标编码 (简化版)
df_engineered[f'{col}_target_enc'] = df_engineered.groupby(col).size() / len(df_engineered)
# 日期型特征工程
if date_features:
for col in date_features:
if col in df_engineered.columns and pd.api.types.is_datetime64_any_dtype(df_engineered[col]):
df_engineered[f'{col}_year'] = df_engineered[col].dt.year
df_engineered[f'{col}_month'] = df_engineered[col].dt.month
df_engineered[f'{col}_day'] = df_engineered[col].dt.day
df_engineered[f'{col}_dayofweek'] = df_engineered[col].dt.dayofweek
df_engineered[f'{col}_is_weekend'] = df_engineered[col].dt.dayofweek.isin([5, 6]).astype(int)
return df_engineered
def data_normalization(df: pd.DataFrame,
method: str = 'standard',
columns: List[str] = None) -> pd.DataFrame:
数据标准化/归一化函数
参数:
df: 输入数据框
method: 标准化方法 ('standard', 'minmax', 'robust')
columns: 需要标准化的列列表
返回:
标准化后的数据框
df_normalized = df.copy()
if columns is None:
columns = df_normalized.select_dtypes(include=[np.number]).columns.tolist()
for col in columns:
if col in df_normalized.columns and pd.api.types.is_numeric_dtype(df_normalized[col]):
if method == 'standard':
# Z-score标准化
mean = df_normalized[col].mean()
std = df_normalized[col].std()
df_normalized[col] = (df_normalized[col] - mean) / std
elif method == 'minmax':
# 最小-最大归一化
min_val = df_normalized[col].min()
max_val = df_normalized[col].max()
if max_val != min_val:
df_normalized[col] = (df_normalized[col] - min_val) / (max_val - min_val)
elif method == 'robust':
# 鲁棒标准化
median = df_normalized[col].median()
iqr = df_normalized[col].quantile(0.75) - df_normalized[col].quantile(0.25)
if iqr != 0:
df_normalized[col] = (df_normalized[col] - median) / iqr
return df_normalized
def data_aggregation(df: pd.DataFrame,
group_columns: List[str],
agg_rules: Dict[str, Union[str, List[str]]],
time_column: str = None,
time_freq: str = 'D') -> pd.DataFrame:
数据聚合函数
参数:
df: 输入数据框
group_columns: 分组列
agg_rules: 聚合规则字典 {列名: 聚合方法}
time_column: 时间列(用于时间序列聚合)
time_freq: 时间频率 ('D'-天, 'W'-周, 'M'-月, 'Q'-季度, 'Y'-年)
返回:
聚合后的数据框
df_agg = df.copy()
# 设置时间索引(如果提供时间列)
if time_column and time_column in df_agg.columns:
if not pd.api.types.is_datetime64_any_dtype(df_agg[time_column]):
df_agg[time_column] = pd.to_datetime(df_agg[time_column])
df_agg = df_agg.set_index(time_column)
# 执行聚合
if time_column:
# 时间序列聚合
grouped = df_agg.groupby([pd.Grouper(freq=time_freq)] + group_columns)
else:
# 普通分组聚合
grouped = df_agg.groupby(group_columns)
# 应用聚合规则
result = grouped.agg(agg_rules)
# 重置索引
if time_column:
result = result.reset_index()
return result
# 使用示例
if __name__ == __main__:
# 创建示例数据
sample_data = {
'date': pd.date_range('2023-01-01', periods=100, freq='D'),
'category': np.random.choice(['A', 'B', 'C'], 100),
'value1': np.random.normal(100, 15, 100),
'value2': np.random.randint(1, 1000, 100)
}
df_sample = pd.DataFrame(sample_data)
# 添加一些缺失值和异常值
df_sample.loc[10:15, 'value1'] = np.nan
df_sample.loc[20, 'value2'] = 5000 # 异常值
print(原始数据形状:, df_sample.shape)
# 1. 数据清洗
df_clean = clean_and_validate_data(
df_sample,
numeric_columns=['value1', 'value2'],
text_columns=['category'],
date_columns=['date']
)
print(清洗后数据形状:, df_clean.shape)
# 2. 缺失值处理
df_filled = handle_missing_values(df_clean, strategy='median')
print(缺失值处理后:, df_filled.isnull().sum().sum())
# 3. 特征工程
df_features = feature_engineering(
df_filled,
numeric_features=['value1', 'value2'],
categorical_features=['category'],
date_features=['date']
)
print(特征工程后列数:, len(df_features.columns))
# 4. 数据标准化
df_normalized = data_normalization(
df_features,
method='standard',
columns=['value1', 'value2']
)
print(标准化完成)
# 5. 数据聚合
df_aggregated = data_aggregation(
df_normalized,
group_columns=['category'],
agg_rules={
'value1': ['mean', 'std', 'min', 'max'],
'value2': ['sum', 'mean']
},
time_column='date',
time_freq='W'
)
print(聚合后数据形状:, df_aggregated.shape)
print(数据处理流程完成!)
```

被折叠的 条评论
为什么被折叠?



