数据处理专题（六）-CSDN博客

本文链接：https://blog.csdn.net/weixin_46281518/article/details/146269043

高级 Pandas

目标

深入理解 Pandas 的高级功能。

学习内容

多级索引

apply 方法

代码示例

1. 导入 Pandas 库

import pandas as pd

2. 创建示例数据集

# Build the sample dataset: one row per (employee, department) sales record.
# Names and departments repeat on purpose so that later snippets can
# demonstrate non-unique MultiIndex lookups.
data = {
    '姓名': ['张三', '李四', '王五', '张三', '赵六', '李四'],
    '部门': ['销售部', '市场部', '技术部', '销售部', '财务部', '市场部'],
    '销售额': [120, 150, 130, 160, 140, 170],
    '成本': [80, 90, 100, 110, 120, 130],
}
df = pd.DataFrame(data)
print(f"示例数据集: \n{df}")

3. 多级索引

创建多级索引

# Promote department and name to a two-level (hierarchical) row index.
# inplace=True modifies `df` itself rather than returning a new frame.
index_levels = ['部门', '姓名']
df.set_index(index_levels, inplace=True)
print(f"设置多级索引后的数据集: \n{df}")

按多级索引选择数据

# Select rows with a complete (department, name) key on the MultiIndex.
full_key = ('销售部', '张三')
selected_data = df.loc[full_key]
print(f"按多级索引选择的数据: \n{selected_data}")

按部分索引选择数据

# Select with only the outermost index level (the department); every row
# under that department is returned.
outer_key = '销售部'
selected_data_partial = df.loc[outer_key]
print(f"按部分索引选择的数据: \n{selected_data_partial}")

重置索引

# Turn the index levels back into ordinary columns. reset_index returns a
# new frame, so `df` keeps its MultiIndex. drop=False (the default) keeps
# the former index values as columns instead of discarding them.
df_reset = df.reset_index(drop=False)
print(f"重置索引后的数据集: \n{df_reset}")

排序多级索引

# Sort rows by the MultiIndex: department first, then name.
# sort_index returns a sorted copy; `df` keeps its original row order.
sort_levels = ['部门', '姓名']
df_sorted = df.sort_index(level=sort_levels)
print(f"排序多级索引后的数据集: \n{df_sorted}")

4. apply 方法

对单列应用函数

def add_ten(x):
    """Return ``x`` increased by 10."""
    return x + 10


# Series.apply calls add_ten once per element of the sales column; the
# result is stored as a new column on `df`.
sales = df['销售额']
df['销售额加十'] = sales.apply(add_ten)
print(f"对 '销售额' 列应用函数后的数据集: \n{df}")

对多列应用函数

def process_row(row):
    """Return a copy of ``row`` with an added '利润' (profit) entry.

    Profit is computed as ``row['销售额'] - row['成本']``.

    The input row is copied before the new entry is written: pandas
    documents mutating the object handed to a user-defined function in
    ``apply`` as unsupported, so the function must not modify ``row``.
    """
    result = row.copy()
    result['利润'] = row['销售额'] - row['成本']
    return result


# axis=1 passes each row (as a Series) to process_row; the returned rows are
# reassembled into a new DataFrame that has the extra '利润' column.
# `df` itself is not modified.
df_processed = df.apply(process_row, axis=1)
print(f"对整个 DataFrame 应用函数后的数据集: \n{df_processed}")

使用 lambda 表达式

# Use an inline lambda with Series.apply to square every sales value and
# store the result as a new column on `df`.
sales = df['销售额']
df['销售额平方'] = sales.apply(lambda sale: sale ** 2)
print(f"使用 lambda 表达式后的数据集: \n{df}")

对特定列应用多个函数

# Apply several aggregations to the sales column in one call. Passing a list
# of function names to Series.agg yields a Series indexed by those names.
stat_names = ['sum', 'mean', 'max', 'min']
df_aggregated = df['销售额'].agg(stat_names)
print(f"对 '销售额' 列应用多个函数后的结果: \n{df_aggregated}")

实践

使用多级索引对数据进行复杂操作。

# Practice script: MultiIndex operations and apply, end to end.
import pandas as pd


def add_ten(x):
    """Return ``x`` increased by 10."""
    return x + 10


def process_row(row):
    """Return a copy of ``row`` with an added '利润' (profit) entry.

    Profit is computed as ``row['销售额'] - row['成本']``. The row is copied
    first because pandas documents mutating the object passed to a
    user-defined function in ``apply`` as unsupported.
    """
    result = row.copy()
    result['利润'] = row['销售额'] - row['成本']
    return result


# Build the sample dataset.
data = {
    '姓名': ['张三', '李四', '王五', '张三', '赵六', '李四'],
    '部门': ['销售部', '市场部', '技术部', '销售部', '财务部', '市场部'],
    '销售额': [120, 150, 130, 160, 140, 170],
    '成本': [80, 90, 100, 110, 120, 130],
}
df = pd.DataFrame(data)
print(f"示例数据集: \n{df}")

# Promote department and name to a two-level row index (modifies df in place).
df.set_index(['部门', '姓名'], inplace=True)
print(f"设置多级索引后的数据集: \n{df}")

# Select rows with a complete (department, name) key.
selected_data = df.loc[('销售部', '张三')]
print(f"按多级索引选择的数据: \n{selected_data}")

# Select with only the outermost level (department).
selected_data_partial = df.loc['销售部']
print(f"按部分索引选择的数据: \n{selected_data_partial}")

# Turn the index levels back into columns (returns a new frame).
df_reset = df.reset_index()
print(f"重置索引后的数据集: \n{df_reset}")

# Sort rows by department, then name (returns a new frame).
df_sorted = df.sort_index(level=['部门', '姓名'])
print(f"排序多级索引后的数据集: \n{df_sorted}")

# Apply a function element-wise to the sales column.
df['销售额加十'] = df['销售额'].apply(add_ten)
print(f"对 '销售额' 列应用函数后的数据集: \n{df}")

# Apply a function row-wise (axis=1); the result is a new frame with the
# extra '利润' column, and df itself is left unchanged.
df_processed = df.apply(process_row, axis=1)
print(f"对整个 DataFrame 应用函数后的数据集: \n{df_processed}")

# Apply an inline lambda to the sales column.
df['销售额平方'] = df['销售额'].apply(lambda x: x ** 2)
print(f"使用 lambda 表达式后的数据集: \n{df}")

# Run several aggregations over the sales column in one call.
df_aggregated = df['销售额'].agg(['sum', 'mean', 'max', 'min'])
print(f"对 '销售额' 列应用多个函数后的结果: \n{df_aggregated}")

小结

通过今天的练习，你应该已经掌握了 Pandas 的多级索引和 apply 方法的高级功能。多级索引可以帮助你更灵活地管理和查询数据，而 apply 方法则可以让你对数据进行复杂的自定义操作。

时间序列分析

目标

学会处理时间序列数据。

学习内容

日期时间类型

时间序列的切片和重采样

代码示例

1. 导入必要的库

import pandas as pd
import matplotlib.pyplot as plt

2. 创建示例时间序列数据集

# Build a sample daily time series: 100 consecutive days from 2023-01-01.
data = {
    '日期': pd.date_range(start='2023-01-01', periods=100, freq='D'),
    # Linear upward trend (0.5 per day) plus a sawtooth repeating every 10 days.
    '收盘价': [100 + i * 0.5 + (i % 10) * 2 for i in range(100)],
}
df = pd.DataFrame(data)
# Use the date column as the index so the frame has a DatetimeIndex, which
# the slicing and resampling snippets below rely on.
df.set_index('日期', inplace=True)
print(f"示例时间序列数据集: \n{df.head()}")

3. 日期时间类型

检查日期时间类型

# Inspect the dtype of the index to confirm it holds datetime values.
index_dtype = df.index.dtype
print(f"日期列的数据类型: {index_dtype}")

将字符串转换为日期时间

# Start from dates stored as plain strings.
df_str = pd.DataFrame({
    '日期': ['2023-01-01', '2023-01-02', '2023-01-03'],
    '收盘价': [100, 101, 102],
})
# Parse the strings into datetime values, then use them as the index.
df_str['日期'] = pd.to_datetime(df_str['日期'])
df_str.set_index('日期', inplace=True)
print(f"字符串转换为日期时间后的数据集: \n{df_str}")

4. 时间序列的切片

按日期范围切片

# Slice rows by a date range. Label slices on a DatetimeIndex include both
# endpoints, so this covers January 1st through January 10th.
first_ten_days = slice('2023-01-01', '2023-01-10')
df_slice = df.loc[first_ten_days]
print(f"按日期范围切片后的数据集: \n{df_slice}")

按月份切片

# Select every row in January 2023 via partial-string indexing.
# This must go through .loc: selecting rows with df['2023-01'] was deprecated
# in pandas 1.2 and removed in 2.0, where the string is looked up as a column
# label and raises KeyError.
df_month_slice = df.loc['2023-01']
print(f"按月份切片后的数据集: \n{df_month_slice}")

5. 时间序列的重采样

按天重采样

# Resample into daily bins and take the mean of each bin.
daily_bins = df.resample('D')
df_resample_daily = daily_bins.mean()
print(f"按天重采样后的数据集: \n{df_resample_daily.head()}")

按周重采样

# Resample into weekly bins ('W') and take the mean of each bin.
weekly_bins = df.resample('W')
df_resample_weekly = weekly_bins.mean()
print(f"按周重采样后的数据集: \n{df_resample_weekly.head()}")

按月重采样

# Resample into month-end bins and take the mean of each bin.
# 'ME' (month end) replaces the 'M' alias, which is deprecated since
# pandas 2.2. NOTE: 'ME' requires pandas >= 2.2; on older versions use 'M'.
df_resample_monthly = df.resample('ME').mean()
print(f"按月重采样后的数据集: \n{df_resample_monthly.head()}")

按季度重采样

# Resample into quarter-end bins and take the mean of each bin.
# 'QE' (quarter end) replaces the 'Q' alias, which is deprecated since
# pandas 2.2. NOTE: 'QE' requires pandas >= 2.2; on older versions use 'Q'.
df_resample_quarterly = df.resample('QE').mean()
print(f"按季度重采样后的数据集: \n{df_resample_quarterly.head()}")

6. 时间序列的滚动窗口

计算滚动平均

# Rolling mean over a window of 7 consecutive rows. The first 6 results are
# NaN because a full window is not yet available.
close_window = df['收盘价'].rolling(window=7)
df_rolling_mean = close_window.mean()
print(f"计算滚动平均后的数据: \n{df_rolling_mean.head(10)}")

计算滚动标准差

# Rolling standard deviation over a window of 7 consecutive rows. The first
# 6 results are NaN because a full window is not yet available.
close_window = df['收盘价'].rolling(window=7)
df_rolling_std = close_window.std()
print(f"计算滚动标准差后的数据: \n{df_rolling_std.head(10)}")

实践

分析一个股票价格的时间序列数据。

# Practice script: analyse a stock-price time series loaded from CSV.
import pandas as pd
import matplotlib.pyplot as plt

# Read the price data, parsing the '日期' column as datetimes and using it as
# the index. The code below reads a '收盘价' (closing price) column from it.
file_path = 'stock_prices.csv'
df = pd.read_csv(file_path, parse_dates=['日期'], index_col='日期')
# Sort chronologically: date-range slicing, resampling and the row-count
# rolling window below all assume rows are in time order, which the CSV's
# row order does not guarantee.
df = df.sort_index()
print(f"原始股票价格数据集: \n{df.head()}")

# Slice January 2023 by label; both endpoints are included.
df_slice = df.loc['2023-01-01':'2023-01-31']
print(f"按日期范围切片后的数据集: \n{df_slice}")

# Weekly mean of every column.
df_resample_weekly = df.resample('W').mean()
print(f"按周重采样后的数据集: \n{df_resample_weekly.head()}")

# Rolling mean of the closing price. window=7 counts 7 rows, which equals
# 7 calendar days only if the data has one row per day.
df_rolling_mean = df['收盘价'].rolling(window=7).mean()
print(f"计算滚动平均后的数据: \n{df_rolling_mean.head(10)}")

# Plot the closing price together with its rolling mean.
# NOTE(review): the labels are Chinese; matplotlib's default font may not
# contain CJK glyphs, so a CJK-capable font may need to be configured.
plt.figure(figsize=(10, 6))
plt.plot(df.index, df['收盘价'], label='收盘价', color='b')
plt.plot(df_rolling_mean.index, df_rolling_mean, label='7天滚动平均', color='r')
plt.xlabel('日期')
plt.ylabel('价格 (元)')
plt.title('股票价格及其7天滚动平均')
plt.legend()
plt.grid(True)
plt.show()