python pandas

Pandas实战指南

最新推荐文章于 2025-07-26 15:18:57 发布

原创 · 最新推荐文章于 2025-07-26 15:18:57 发布 · 250 阅读

0 ·

CC 4.0 BY-SA版权

python 专栏收录该内容

7 篇文章

订阅专栏

本文详细介绍了Pandas库的基础操作，包括数据加载、清洗、增删改查等基本功能，并深入探讨了分组聚合、数据透视等高级特性。此外，还介绍了如何通过Pandas进行数据统计分析、处理缺失值以及实现数据的高级筛选。

import numpy as np
import pandas as pd

# Sample frame used by the sections below: three integer columns, eight rows.
df = pd.DataFrame({
    'a': [1, 4, 2, 3, 3, 2, 3, 1],
    'b': [4, 3, 2, 1, 1, 3, 4, 1],
    'c': [2, 3, 1, 2, 4, 2, 4, 2],
})
print(df)

# ●●● Basic information ●●●
print(df.dtypes)  # dtype of each column
# info() writes its report itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the report.
df.info()
print(df.describe())  # summary statistics per column
print(df.index)  # row labels
print(df.columns)  # column labels

# ●●● Create / delete / update / read ●●●
# Adding and removing columns
df['d'] = [1, 3, 2, 4, 3, 3, 4, 1]  # add column d
df['e'] = [4, 3, 4, 2, 1, 2, 3, 4]  # add column e
del df['d']  # remove column d with the del statement
df.drop(['e'], axis=1, inplace=True)  # remove column e with drop()

# Adding and removing rows
df = pd.concat([df, pd.DataFrame({'a': [4, 5], 'b': [7, 8], 'c': [10, 11]})], ignore_index=True)  # append two rows
df.drop([3, 4], inplace=True)  # remove the rows labelled 3 and 4

# Updating single cells
# DataFrame.ix was removed in pandas 1.0, and the chained form
# df.ix[0][0] = ... may assign into a temporary copy. .loc / .iloc write
# straight into df.
df.loc[0, 'a'] = 1  # address the cell by row label and column label
df.loc[0, 'b'] = 2
df.loc[0, 'c'] = 3
df.iloc[0, 0] = 1  # address the cell by row position and column position
df.iloc[0, 1] = 4
df.iloc[0, 2] = 2

# Replacing values
# replace() returns a new DataFrame and leaves df untouched, so each result is
# printed instead of being silently discarded.
print(df.replace(1, 0))  # every 1 becomes 0
print(df.replace([1, 2], 0))  # every 1 and every 2 becomes 0
print(df.replace([1, 2], [0, 1]))  # 1 -> 0 and 2 -> 1
print(df.replace({1: 0, 2: 1}))  # the same mapping written as a dict

# Reading data
print(df.a)  # column a via attribute access
print(df['a'])  # column a via key
print(df[['a', 'b']])  # columns a and b
print(df[0:2])  # first two rows
print(df.loc[0, :])  # the row labelled 0
print(df.iloc[0, :])  # the row at position 0
print(df.loc[:, ['a', 'b']])  # columns a and b, selected by label
print(df.iloc[:, [0, 1]])  # columns a and b, selected by position
# DataFrame.ix was removed in pandas 1.0. Rows are picked by position here
# because the row labelled 3 is dropped earlier in this script and .loc raises
# KeyError for a missing label; df.index[...] converts positions to labels.
print(df.loc[df.index[[1, 3]], ['a', 'b']])  # columns a and b of the rows at positions 1 and 3

# Conditional selection with boolean masks
print(df.loc[df['a'] > 1])  # rows where a is greater than 1
print(df.loc[(2 <= df['a']) & (df['a'] <= 3)])  # rows where a lies between 2 and 3, both ends included
print(df.query('b>c'))  # rows where b is greater than c, written as a query string
print(df.loc[df['b'] > df['c']])  # the same filter written as a boolean mask

# Removing duplicates
print(df.drop_duplicates(keep='first'))  # drop rows that repeat an earlier row in every column
print(df.drop_duplicates(subset=['b']))  # keep only the first row for each distinct value of b

# 3. Sorting
print(df.sort_index(axis=0))  # order the rows by row label
print(df.sort_index(axis=1))  # order the columns by column label
print(df.sort_values(by='a'))  # order the rows by the values in column a
print(df.sort_values(axis=1, by=0))  # order the columns by the values in the row labelled 0

# 4. Concatenation
# Vertical: stack two extra rows underneath df (the row labels of both inputs are kept as they are)
df_concat = pd.DataFrame({'a': [4, 5],
                          'b': [5, 6],
                          'c': [7, 8]})
df_concat = pd.concat([df, df_concat])
print(df_concat)

# Horizontal: attach a new column d next to df.
# concat(axis=1) aligns on row labels, not on positions. Rows are dropped from
# df earlier in this script, so its index is no longer a plain 0..n-1 range;
# a default-indexed column would line up with the wrong rows and add NaN-filled
# rows for the unmatched labels. Reusing df.index pairs the values one-to-one.
df_concat = pd.DataFrame({'d': [0, 1, 2, 3, 4, 5, 6, 7]}, index=df.index)
df_concat = pd.concat([df, df_concat], axis=1)
print(df_concat)

# 5. Joining
# Left join on column a: every row of df is kept and d is looked up in the
# right-hand table; values of a that are absent there get NaN in d.
df_merge = pd.merge(
    left=df,
    right=pd.DataFrame({'a': [1, 2, 3, 4], 'd': [3, 4, 2, 2]}),
    how='left',
    on='a',
)
print(df_merge)

# 6. Descriptive statistics (each reduction runs down the rows, one result per column)
print(df.mean(axis=0))  # arithmetic mean
print(df.median(axis=0))  # median
print(df.var(axis=0))  # variance
print(df.max(axis=0))  # largest value
print(df.min(axis=0))  # smallest value
print(df.a.value_counts())  # how often each distinct value occurs in column a

# 7. Missing values
# fillna() returns a new frame with every NaN replaced by 0, so df is rebound to the result.
df = df.fillna(value=0)
print(df)

# 8. Grouping
# Aggregations are named by string: passing NumPy callables such as np.mean to
# agg() is deprecated in recent pandas releases. Every intermediate result is
# printed; before, the first two were overwritten without ever being shown.
group = df.groupby('a')['b'].agg(['mean', 'median', 'max', 'min'])  # several statistics of b per value of a
print(group)
group = df.groupby('a').agg({'b': ['min', 'max'], 'c': ['sum', 'max']})  # a different set of statistics per column
print(group)
group = df.groupby('a')['b'].agg(['min', 'max'])  # min and max of b per value of a
print(group)

group = df.groupby(['a', 'c']).count()  # non-null count of the remaining columns per (a, c) pair
group = group.unstack()  # move the inner index level c from the rows to the columns
print(group)

group = df['b'].groupby(df['a']).count()  # non-null count of b per value of a
print(group)

# 9. Pivot table
# Sum of c for each (a, b) combination, with a on the rows and b on the columns.
# The aggregation is named by the string 'sum' because passing np.sum is
# deprecated in recent pandas releases; the computed table is the same.
table = pd.pivot_table(df, values='c', index=['a'], columns=['b'], aggfunc='sum')
print(table)

# 10. crosstab + reset_index
# crosstab() counts how often each combination of two discrete variables occurs.
a_b_count = pd.crosstab(df['a'], df['b'])
print(a_b_count)
# Divide every row by its own total to turn the counts into per-row proportions.
a_b_ratio = a_b_count.div(a_b_count.sum(axis='columns'), axis='index')
print(a_b_ratio)
# reset_index() installs a fresh 0..n-1 row index and keeps the old index as a new column ...
a_b = a_b_ratio.reset_index(drop=False)
print(a_b)
# ... whereas drop=True installs the fresh index and discards the old one.
a_b = a_b_ratio.reset_index(drop=True)
print(a_b)

# 11.其他方法（自己扩展）
# (1)离散化和面元 pd.cut()
# (2)哑变量 pd.get_dummies()
# (3)重命名索引 df.rename()