Python中实现高效数据处理的5个实用函数-优快云博客

```python

import pandas as pd

import numpy as np

from typing import List, Union, Dict

import re

def clean_and_validate_data(df: pd.DataFrame,

numeric_columns: List[str] = None,

text_columns: List[str] = None,

date_columns: List[str] = None) -> pd.DataFrame:

数据清洗与验证函数

参数:

df: 输入的数据框

numeric_columns: 数值型列名列表

text_columns: 文本型列名列表

date_columns: 日期型列名列表

清洗后的数据框

df_clean = df.copy()

# 处理数值型数据

if numeric_columns:

for col in numeric_columns:

if col in df_clean.columns:

# 替换异常值为NaN

df_clean[col] = pd.to_numeric(df_clean[col], errors='coerce')

# 处理极端值 (使用IQR方法)

Q1 = df_clean[col].quantile(0.25)

Q3 = df_clean[col].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 IQR

upper_bound = Q3 + 1.5 IQR

df_clean[col] = np.where(

(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound),

np.nan, df_clean[col]

)

# 处理文本型数据

if text_columns:

for col in text_columns:

if col in df_clean.columns:

# 去除前后空格

df_clean[col] = df_clean[col].astype(str).str.strip()

# 处理空字符串

df_clean[col] = df_clean[col].replace(['', 'nan', 'None'], np.nan)

# 处理日期型数据

if date_columns:

for col in date_columns:

if col in df_clean.columns:

df_clean[col] = pd.to_datetime(df_clean[col], errors='coerce')

return df_clean

def handle_missing_values(df: pd.DataFrame,

strategy: str = 'mean',

categorical_strategy: str = 'mode') -> pd.DataFrame:

缺失值处理函数

参数:

df: 输入数据框

strategy: 数值型数据填充策略 ('mean', 'median', 'mode', 'ffill', 'bfill')

categorical_strategy: 分类型数据填充策略 ('mode', 'unknown')

处理缺失值后的数据框

df_filled = df.copy()

for column in df_filled.columns:

if df_filled[column].isnull().sum() > 0:

# 数值型数据

if pd.api.types.is_numeric_dtype(df_filled[column]):

if strategy == 'mean':

fill_value = df_filled[column].mean()

elif strategy == 'median':

fill_value = df_filled[column].median()

elif strategy == 'mode':

fill_value = df_filled[column].mode()[0] if not df_filled[column].mode().empty else 0

elif strategy == 'ffill':

df_filled[column] = df_filled[column].fillna(method='ffill')

continue

elif strategy == 'bfill':

df_filled[column] = df_filled[column].fillna(method='bfill')

continue

df_filled[column] = df_filled[column].fillna(fill_value)

# 分类型数据

else:

if categorical_strategy == 'mode':

fill_value = df_filled[column].mode()[0] if not df_filled[column].mode().empty else 'Unknown'

else:

fill_value = 'Unknown'

df_filled[column] = df_filled[column].fillna(fill_value)

return df_filled

def feature_engineering(df: pd.DataFrame,

numeric_features: List[str] = None,

categorical_features: List[str] = None,

date_features: List[str] = None) -> pd.DataFrame:

特征工程函数

参数:

df: 输入数据框

numeric_features: 数值型特征列

categorical_features: 分类型特征列

date_features: 日期型特征列

包含新特征的数据框

df_engineered = df.copy()

# 数值型特征工程

if numeric_features:

for col in numeric_features:

if col in df_engineered.columns:

# 创建多项式特征

df_engineered[f'{col}_squared'] = df_engineered[col] 2

df_engineered[f'{col}_log'] = np.log1p(np.abs(df_engineered[col]))

# 创建分箱特征

df_engineered[f'{col}_binned'] = pd.cut(df_engineered[col], bins=5, labels=False)

# 分类型特征工程

if categorical_features:

for col in categorical_features:

if col in df_engineered.columns:

# 频率编码

freq_encoding = df_engineered[col].value_counts().to_dict()

df_engineered[f'{col}_freq'] = df_engineered[col].map(freq_encoding)

# 目标编码 (简化版)

df_engineered[f'{col}_target_enc'] = df_engineered.groupby(col).size() / len(df_engineered)

# 日期型特征工程

if date_features:

for col in date_features:

if col in df_engineered.columns and pd.api.types.is_datetime64_any_dtype(df_engineered[col]):

df_engineered[f'{col}_year'] = df_engineered[col].dt.year

df_engineered[f'{col}_month'] = df_engineered[col].dt.month

df_engineered[f'{col}_day'] = df_engineered[col].dt.day

df_engineered[f'{col}_dayofweek'] = df_engineered[col].dt.dayofweek

df_engineered[f'{col}_is_weekend'] = df_engineered[col].dt.dayofweek.isin([5, 6]).astype(int)

return df_engineered

def data_normalization(df: pd.DataFrame,

method: str = 'standard',

columns: List[str] = None) -> pd.DataFrame:

数据标准化/归一化函数

参数:

df: 输入数据框

method: 标准化方法 ('standard', 'minmax', 'robust')

columns: 需要标准化的列列表

标准化后的数据框

df_normalized = df.copy()

if columns is None:

columns = df_normalized.select_dtypes(include=[np.number]).columns.tolist()

for col in columns:

if col in df_normalized.columns and pd.api.types.is_numeric_dtype(df_normalized[col]):

if method == 'standard':

# Z-score标准化

mean = df_normalized[col].mean()

std = df_normalized[col].std()

df_normalized[col] = (df_normalized[col] - mean) / std

elif method == 'minmax':

# 最小-最大归一化

min_val = df_normalized[col].min()

max_val = df_normalized[col].max()

if max_val != min_val:

df_normalized[col] = (df_normalized[col] - min_val) / (max_val - min_val)

elif method == 'robust':

# 鲁棒标准化

median = df_normalized[col].median()

iqr = df_normalized[col].quantile(0.75) - df_normalized[col].quantile(0.25)

if iqr != 0:

df_normalized[col] = (df_normalized[col] - median) / iqr

return df_normalized

def data_aggregation(df: pd.DataFrame,

group_columns: List[str],

agg_rules: Dict[str, Union[str, List[str]]],

time_column: str = None,

time_freq: str = 'D') -> pd.DataFrame:

数据聚合函数

参数:

df: 输入数据框

group_columns: 分组列

agg_rules: 聚合规则字典 {列名: 聚合方法}

time_column: 时间列(用于时间序列聚合)

time_freq: 时间频率 ('D'-天, 'W'-周, 'M'-月, 'Q'-季度, 'Y'-年)

聚合后的数据框

df_agg = df.copy()

# 设置时间索引(如果提供时间列)

if time_column and time_column in df_agg.columns:

if not pd.api.types.is_datetime64_any_dtype(df_agg[time_column]):

df_agg[time_column] = pd.to_datetime(df_agg[time_column])

df_agg = df_agg.set_index(time_column)

# 执行聚合

if time_column:

# 时间序列聚合

grouped = df_agg.groupby([pd.Grouper(freq=time_freq)] + group_columns)

else:

# 普通分组聚合

grouped = df_agg.groupby(group_columns)

# 应用聚合规则

result = grouped.agg(agg_rules)

# 重置索引

if time_column:

result = result.reset_index()

return result

# 使用示例

if __name__ == __main__:

# 创建示例数据

sample_data = {

'date': pd.date_range('2023-01-01', periods=100, freq='D'),

'category': np.random.choice(['A', 'B', 'C'], 100),

'value1': np.random.normal(100, 15, 100),

'value2': np.random.randint(1, 1000, 100)

}

df_sample = pd.DataFrame(sample_data)

# 添加一些缺失值和异常值

df_sample.loc[10:15, 'value1'] = np.nan

df_sample.loc[20, 'value2'] = 5000 # 异常值

print(原始数据形状:, df_sample.shape)

# 1. 数据清洗

df_clean = clean_and_validate_data(

df_sample,

numeric_columns=['value1', 'value2'],

text_columns=['category'],

date_columns=['date']

)

print(清洗后数据形状:, df_clean.shape)

# 2. 缺失值处理

df_filled = handle_missing_values(df_clean, strategy='median')

print(缺失值处理后:, df_filled.isnull().sum().sum())

# 3. 特征工程

df_features = feature_engineering(

df_filled,

numeric_features=['value1', 'value2'],

categorical_features=['category'],

date_features=['date']

)

print(特征工程后列数:, len(df_features.columns))

# 4. 数据标准化

df_normalized = data_normalization(

df_features,

method='standard',

columns=['value1', 'value2']

)

print(标准化完成)

# 5. 数据聚合

df_aggregated = data_aggregation(

df_normalized,

group_columns=['category'],

agg_rules={

'value1': ['mean', 'std', 'min', 'max'],

'value2': ['sum', 'mean']

time_column='date',

time_freq='W'

)

print(聚合后数据形状:, df_aggregated.shape)

print(数据处理流程完成!)

```