Intro to Data Science with Python (Playing with Data in Pandas) - Notes 03

This post covers basic data-science operations with Python's Pandas library: math operations, sorting, merging, preprocessing, deduplication, time series handling, binning, grouping, and aggregation.


Intro to Data Science with Python (Playing with Data in Pandas)

Study notes for the 慕课网 (IMOOC) Intro to Data Science course

I. Simple math operations on Series and DataFrames

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

1. Adding Series

s1 = Series([1,2,3],index=['A','B','C'])
s1
A    1
B    2
C    3
dtype: int64
s2 = Series([4,5,6,7],index=['B','C','D','E'])
s2
B    4
C    5
D    6
E    7
dtype: int64
# values with matching index labels are added; a missing label + a value is still NaN
s1 + s2     
A    NaN
B    6.0
C    8.0
D    NaN
E    NaN
dtype: float64
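If a missing label should count as 0 instead, add() with fill_value does that (a quick sketch using s1 and s2 above):

s1.add(s2,fill_value=0)
A    1.0
B    6.0
C    8.0
D    6.0
E    7.0
dtype: float64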

2. DataFrame operations

df1 = DataFrame(np.arange(4).reshape(2,2),index=['A','B'],columns=['AA','BB'])
df1
   AA  BB
A   0   1
B   2   3
df2 = DataFrame(np.arange(9).reshape(3,3),index=['A','B','C'],columns=['AA','BB','CC'])
df2
   AA  BB  CC
A   0   1   2
B   3   4   5
C   6   7   8
# addition
df1 + df2
    AA   BB  CC
A  0.0  2.0 NaN
B  5.0  7.0 NaN
C  NaN  NaN NaN
df3 = DataFrame([[1,2,3],[4,5,np.nan],[7,8,9]],
                index=['A','B','C'],columns=['c1','c2','c3'])
df3
   c1  c2   c3
A   1   2  3.0
B   4   5  NaN
C   7   8  9.0
# sum() is column-wise by default; NaN values are skipped
# axis=1 gives row sums instead
df3.sum()
c1    12.0
c2    15.0
c3    12.0
dtype: float64
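With df3 above, the row sums come out as (a quick check):

df3.sum(axis=1)
A     6.0
B     9.0
C    24.0
dtype: float64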
# min() and max() are also column-wise by default
df3.min()
c1    1.0
c2    2.0
c3    3.0
dtype: float64
df3.describe()
        c1   c2        c3
count  3.0  3.0  2.000000
mean   4.0  5.0  6.000000
std    3.0  3.0  4.242641
min    1.0  2.0  3.000000
25%    2.5  3.5  4.500000
50%    4.0  5.0  6.000000
75%    5.5  6.5  7.500000
max    7.0  8.0  9.000000

II. Sorting Series and DataFrames

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

1. Sorting a Series

s1 = Series(np.random.randn(10))
s1
0    0.773184
1   -0.403266
2    0.980296
3   -0.631971
4    0.636405
5    0.732921
6    0.756932
7   -0.597943
8   -1.152224
9    0.771753
dtype: float64
# sort by values, ascending by default
# ascending=False for descending
s1.sort_values()
8   -1.152224
3   -0.631971
7   -0.597943
1   -0.403266
4    0.636405
5    0.732921
6    0.756932
9    0.771753
0    0.773184
2    0.980296
dtype: float64
# sort by index, ascending by default
# ascending=False for descending
s1.sort_index(ascending=False)
9    0.771753
8   -1.152224
7   -0.597943
6    0.756932
5    0.732921
4    0.636405
3   -0.631971
2    0.980296
1   -0.403266
0    0.773184
dtype: float64

2. Sorting a DataFrame

df1 = DataFrame(np.random.randn(20).reshape(4,5),columns=['a','b','c','d','e'])
df1
          a         b         c         d         e
0  0.173908 -0.978290 -0.791529 -1.769024 -1.971549
1  0.553871 -1.436033 -1.348179  0.244897  1.465071
2  1.378478 -0.869294  0.797711 -0.539044 -1.211413
3  2.739525 -0.111984 -1.468831  1.418846  0.182972
# sort by column 'a'
# ascending by default; ascending=False for descending
# axis=0 (the default) sorts rows by a column's values
df1.sort_values('a',ascending=False)
          a         b         c         d         e
3  2.739525 -0.111984 -1.468831  1.418846  0.182972
2  1.378478 -0.869294  0.797711 -0.539044 -1.211413
1  0.553871 -1.436033 -1.348179  0.244897  1.465071
0  0.173908 -0.978290 -0.791529 -1.769024 -1.971549

III. Renaming a DataFrame's index

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

1. Renaming the index

df1 = DataFrame(np.arange(9).reshape(3,3),index=['A','B','C'],columns=['AA','BB','CC'])
df1
   AA  BB  CC
A   0   1   2
B   3   4   5
C   6   7   8
# assign a new index directly
df1.index=Series(['a','b','c'])
df1
   AA  BB  CC
a   0   1   2
b   3   4   5
c   6   7   8
# the map method
df1.index = df1.index.map(str.upper)
df1
   AA  BB  CC
A   0   1   2
B   3   4   5
C   6   7   8
# rename returns a renamed copy; the original df1 is unchanged
df1.rename(index=str.lower,columns=str.lower)
   aa  bb  cc
a   0   1   2
b   3   4   5
c   6   7   8
df1
   AA  BB  CC
A   0   1   2
B   3   4   5
C   6   7   8
# rename specific index labels via a dict
# see the docs for more options
df1.rename(index={'A':'g'})
   AA  BB  CC
g   0   1   2
B   3   4   5
C   6   7   8
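A dict works for columns too, e.g. renaming only the AA column (a quick sketch):

df1.rename(columns={'AA':'aa'})
   aa  BB  CC
A   0   1   2
B   3   4   5
C   6   7   8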

2. Review

list1 = [1,2,3,4]
list2 = ['1','2','3','4']
# convert the elements of list1 to strings
[str(x) for x in list1]
['1', '2', '3', '4']
# conversion with map
list(map(str,list1))
['1', '2', '3', '4']

3. Writing your own map function

def test_map(x):
    return x + '_ABC'
df1.index.map(test_map)
Index(['A_ABC', 'B_ABC', 'C_ABC'], dtype='object')
df1.rename(index=test_map)
       AA  BB  CC
A_ABC   0   1   2
B_ABC   3   4   5
C_ABC   6   7   8

IV. DataFrame merge operations

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df1 = DataFrame({'key':['X','Y','Z'],'data_set_1':[1,2,3]})
df1
  key  data_set_1
0   X           1
1   Y           2
2   Z           3
df2 = DataFrame({'key':['X','X','C'],'data_set_2':[4,5,6]})
df2
  key  data_set_2
0   X           4
1   X           5
2   C           6
# by default merge joins on the column(s) both DataFrames share
# or specify explicitly with on=...
# how=inner/left/right/outer; see the docs for details
# here both df1 and df2 have 'key'
pd.merge(df1,df2)
  key  data_set_1  data_set_2
0   X           1           4
1   X           1           5
# left: keep every row of df1; unmatched df2 rows are dropped
pd.merge(df1,df2,on='key',how='left')
  key  data_set_1  data_set_2
0   X           1         4.0
1   X           1         5.0
2   Y           2         NaN
3   Z           3         NaN
# right: keep every row of df2; unmatched df1 rows are dropped
pd.merge(df1,df2,on='key',how='right')
  key  data_set_1  data_set_2
0   X         1.0           4
1   X         1.0           5
2   C         NaN           6
# outer: the union of left and right; everything is kept
pd.merge(df1,df2,on='key',how='outer')
  key  data_set_1  data_set_2
0   X         1.0         4.0
1   X         1.0         5.0
2   Y         2.0         NaN
3   Z         3.0         NaN
4   C         NaN         6.0

V. Concatenate and Combine

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

1> Concatenate

1. NumPy arrays

arr1 = np.arange(9).reshape(3,3)
arr1
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
arr2 = np.arange(9).reshape(3,3)
arr2
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
# concatenate stacks along axis=0 (vertically) by default
np.concatenate([arr1,arr2])
array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])
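With axis=1 the arrays are joined side by side instead:

np.concatenate([arr1,arr2],axis=1)
array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])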

2. Series

s1 = Series([1,2,3],index=['X','Y','Z'])
s2 = Series([4,5],index=['A','B'])

pd.concat([s1,s2])
X    1
Y    2
Z    3
A    4
B    5
dtype: int64

pd.concat([s1,s2],axis=1)
     0    1
A  NaN  4.0
B  NaN  5.0
X  1.0  NaN
Y  2.0  NaN
Z  3.0  NaN

3. DataFrame

df1 = DataFrame(np.random.randn(4,3),columns=['X','Y','Z'])
df1
          X         Y         Z
0  0.542921  1.710888  1.027810
1  0.624385 -0.825916  0.893589
2 -1.000241  2.551461  1.541975
3  0.020806  0.635956  0.573629
df2 = DataFrame(np.random.randn(3,3),columns=['X','Y','A'])
df2
          X         Y         A
0 -0.721702 -0.106360 -1.701335
1  0.591009  0.233578  0.212189
2  0.855895 -1.831944  0.247578
pd.concat([df1,df2])
          A         X         Y         Z
0       NaN  0.542921  1.710888  1.027810
1       NaN  0.624385 -0.825916  0.893589
2       NaN -1.000241  2.551461  1.541975
3       NaN  0.020806  0.635956  0.573629
0 -1.701335 -0.721702 -0.106360       NaN
1  0.212189  0.591009  0.233578       NaN
2  0.247578  0.855895 -1.831944       NaN

2> Combine

1. Series

s1 = Series([2,np.nan,4,np.nan],index=['A','B','C','D'])
s1
A    2.0
B    NaN
C    4.0
D    NaN
dtype: float64
s2 = Series([1,2,3,4],index=['A','B','C','D'])
s2
A    1
B    2
C    3
D    4
dtype: int64
# NaN values in s1 are filled from s2
s1.combine_first(s2)
A    2.0
B    2.0
C    4.0
D    4.0
dtype: float64

2. DataFrame

df1 = DataFrame({'X':[1, np.nan, 3, np.nan],
                'Y':[5, np.nan, 7, np.nan],
                'Z':[9, np.nan, 11, np.nan]})
df1
     X    Y     Z
0  1.0  5.0   9.0
1  NaN  NaN   NaN
2  3.0  7.0  11.0
3  NaN  NaN   NaN
df2 = DataFrame({'Z':[np.nan, 10, np.nan, 12],
                'A':[1,2,3,4],})
df2
      Z  A
0   NaN  1
1  10.0  2
2   NaN  3
3  12.0  4
# NaN values in df1 are filled from df2; columns only in df2 are added
df1.combine_first(df2)
     A    X    Y     Z
0  1.0  1.0  5.0   9.0
1  2.0  NaN  NaN  10.0
2  3.0  3.0  7.0  11.0
3  4.0  NaN  NaN  12.0

VI. Preprocessing data with apply

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df = pd.read_csv('data.csv')
df.head()
        time            data
0  1124352.0  AA: BB: CC: DD
1  1124352.0  AA: BB: CC: DD
2  1124352.0  AA: BB: CC: DD
3  1124353.0  AA: BB: CC: DD
4  1124353.0  AA: BB: CC: DD
df.size
190
# note: df.size counts cells (95 rows x 2 columns = 190), not rows; len(df) gives the row count
s1 = Series(['a']*190)
df['A'] = s1
df.head()
        time            data  A
0  1124352.0  AA: BB: CC: DD  a
1  1124352.0  AA: BB: CC: DD  a
2  1124352.0  AA: BB: CC: DD  a
3  1124353.0  AA: BB: CC: DD  a
4  1124353.0  AA: BB: CC: DD  a
# apply() applies a function to each element of a column
df['A'] = df['A'].apply(str.upper)
df.head()
        time            data  A
0  1124352.0  AA: BB: CC: DD  A
1  1124352.0  AA: BB: CC: DD  A
2  1124352.0  AA: BB: CC: DD  A
3  1124353.0  AA: BB: CC: DD  A
4  1124353.0  AA: BB: CC: DD  A

Splitting the data column into multiple columns

# split on spaces
df['data'][0].split(' ')
['', 'AA:', 'BB:', 'CC:', 'DD']
# strip surrounding whitespace first to avoid the empty field
list_1 = df['data'][0].strip().split(' ')
list_1
['AA:', 'BB:', 'CC:', 'DD']

Task: extract the 2nd and 4th fields and store them as new columns

Careful with head(): if you apply an operation through head(), only those first few rows are affected.

Use head() for inspection only.
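A small sketch of the pitfall (hypothetical, using the df above):

tmp = df.head()   # head() returns a new 5-row DataFrame
tmp['A'] = 'x'    # modifies only tmp (pandas may warn); df itself stays unchanged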

list_1[1],list_1[3]
('BB:', 'DD')
def foo(line):
    items = line.strip().split(' ')
    return Series([items[1],items[3]])
df_tmp = df['data'].apply(foo)
df_tmp.head()
     0   1
0  BB:  DD
1  BB:  DD
2  BB:  DD
3  BB:  DD
4  BB:  DD
df_tmp = df_tmp.rename(columns={0:"B",1:"D"})
df_tmp.head()
     B   D
0  BB:  DD
1  BB:  DD
2  BB:  DD
3  BB:  DD
4  BB:  DD
df.head()
        time            data  A
0  1124352.0  AA: BB: CC: DD  A
1  1124352.0  AA: BB: CC: DD  A
2  1124352.0  AA: BB: CC: DD  A
3  1124353.0  AA: BB: CC: DD  A
4  1124353.0  AA: BB: CC: DD  A
# combine df with the new columns
df_new = df.combine_first(df_tmp)
df_new.head()
   A    B   D            data       time
0  A  BB:  DD  AA: BB: CC: DD  1124352.0
1  A  BB:  DD  AA: BB: CC: DD  1124352.0
2  A  BB:  DD  AA: BB: CC: DD  1124352.0
3  A  BB:  DD  AA: BB: CC: DD  1124353.0
4  A  BB:  DD  AA: BB: CC: DD  1124353.0
# drop the columns we no longer need
del df_new['A']
del df_new['data']
df_new.head()
     B   D       time
0  BB:  DD  1124352.0
1  BB:  DD  1124352.0
2  BB:  DD  1124352.0
3  BB:  DD  1124353.0
4  BB:  DD  1124353.0
# save as df_new.csv
df_new.to_csv('df_new.csv')

VII. Cleaning data by dropping duplicates

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df = pd.read_csv('data.csv')
df.head()
      time            data
0  1124352  AA: BB: CC: DD
1  1124352  AA: BB: CC: DD
2  1124352  AA: BB: CC: DD
3  1124353  AA: BB: CC: DD
4  1124353  FF: BB: CC: DD
df.size
190
len(df)
95
# number of distinct values in the time column
len(df['time'].unique())
22

Deduplication

# duplicated() flags repeated values
df['time'].duplicated().head()
0    False
1     True
2     True
3    False
4     True
Name: time, dtype: bool
# drop_duplicates() drops the rows duplicated() marks True
df_new = df.drop_duplicates()
df_new.head()
      time             data
0  1124352   AA: BB: CC: DD
3  1124353   AA: BB: CC: DD
4  1124353   FF: BB: CC: DD
6  1124354   AA: BB: CC: DD
7  1124354  CC: BB : CC: DD
# deduplicate on the data column only; keep='last' keeps each value's last occurrence
df_new = df.drop_duplicates(['data'],keep='last')
df_new.head()
       time             data
4   1124353   FF: BB: CC: DD
7   1124354  CC: BB : CC: DD
94  1124373   AA: BB: CC: DD

VIII. Time series basics

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from datetime import datetime
t1 = datetime(2009,10,20)
t1
datetime.datetime(2009, 10, 20, 0, 0)
date_list = [
    datetime(2016,9,1),
    datetime(2016,9,10),
    datetime(2017,9,1),
    datetime(2017,9,10),
    datetime(2017,10,1)
]
date_list
[datetime.datetime(2016, 9, 1, 0, 0),
 datetime.datetime(2016, 9, 10, 0, 0),
 datetime.datetime(2017, 9, 1, 0, 0),
 datetime.datetime(2017, 9, 10, 0, 0),
 datetime.datetime(2017, 10, 1, 0, 0)]
s1 = Series(np.random.rand(5),index=date_list)
s1
2016-09-01    0.497374
2016-09-10    0.686276
2017-09-01    0.788562
2017-09-10    0.765383
2017-10-01    0.546197
dtype: float64
s1[1]
0.6862758320157275
s1[datetime(2016,9,10)]
0.6862758320157275
s1['2016-9-10']
0.6862758320157275
s1['20160910']
0.6862758320157275
s1['2016-9']
2016-09-01    0.497374
2016-09-10    0.686276
dtype: float64
s1['2017']
2017-09-01    0.788562
2017-09-10    0.765383
2017-10-01    0.546197
dtype: float64
# date_range(start, end, periods, ...) builds a fixed-frequency DatetimeIndex
# freq='D' steps by day (the default)
# 'W' steps by week (anchored on Sunday; 'W-MON' anchors on Monday)
# 'H' steps by hour; '5H' every 5 hours
date_list_new = pd.date_range(start='2017-01-01',periods=10)
date_list_new
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04', ...)
s2 = Series(np.random.rand(10),index=date_list_new)
s2
2017-01-01    0.152626
2017-01-02    0.590219
...
Freq: D, dtype: float64
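A couple of freq examples (a quick sketch):

pd.date_range('2017-01-01',periods=3,freq='W-MON')
DatetimeIndex(['2017-01-02', '2017-01-09', '2017-01-16'], dtype='datetime64[ns]', freq='W-MON')
pd.date_range('2017-01-01',periods=3,freq='5H')
DatetimeIndex(['2017-01-01 00:00:00', '2017-01-01 05:00:00',
               '2017-01-01 10:00:00'],
              dtype='datetime64[ns]', freq='5H')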

IX. Resampling and plotting time series data

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
t_range = pd.date_range('2016-01-01','2016-12-01')
t_range
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', ...)
s1 = Series(np.random.randn(len(t_range)),index=t_range)
s1.head()
2016-01-01    0.145550
2016-01-02    2.444428
...
Freq: D, dtype: float64

Resampling

# t_range holds one value per day; we want one value per month
# i.e. compute each month's mean
s1['2016-01'].mean()
-0.14446619115276657
# press Shift+Tab in Jupyter to see a method's docstring
# 'M' resamples by month, 'H' by hour
s1_month = s1.resample('M').mean()
s1_month.head()
2016-01-31   -0.144466
2016-02-29    0.083245
2016-03-31    0.204149
...
Freq: M, dtype: float64

Filling values

# ffill() fills forward from the previous value
s1_test = s1.resample('H').ffill()
s1_test.head()
2016-01-01 00:00:00    0.14555
2016-01-01 01:00:00    0.14555
2016-01-01 02:00:00    0.14555
2016-01-01 03:00:00    0.14555
2016-01-01 04:00:00    0.14555
Freq: H, dtype: float64
# bfill() fills backward from the next value
s1_test = s1.resample('H').bfill()
s1_test.head()
2016-01-01 00:00:00    0.145550
2016-01-01 01:00:00    2.444428
2016-01-01 02:00:00    2.444428
2016-01-01 03:00:00    2.444428
2016-01-01 04:00:00    2.444428
Freq: H, dtype: float64

Simple plotting

t_range = pd.date_range('2016-01-01','2016-12-31',freq='H')
t_range
DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
               '2016-01-01 02:00:00', '2016-01-01 03:00:00',
               '2016-01-01 04:00:00', '2016-01-01 05:00:00',
               '2016-01-01 06:00:00', '2016-01-01 07:00:00',
               '2016-01-01 08:00:00', '2016-01-01 09:00:00',
               ...
         )
stock_df = DataFrame(index=t_range)
stock_df.head(10)
2016-01-01 00:00:00
2016-01-01 01:00:00
2016-01-01 02:00:00
2016-01-01 03:00:00
2016-01-01 04:00:00
2016-01-01 05:00:00
stock_df['data1'] = np.random.randint(80,160,size=len(t_range))
stock_df.head(10)
                     data1
2016-01-01 00:00:00    147
2016-01-01 01:00:00    147
2016-01-01 02:00:00    159
2016-01-01 03:00:00     99
2016-01-01 04:00:00    119
2016-01-01 05:00:00    136
2016-01-01 06:00:00    102
stock_df['data2'] = np.random.randint(40,140,size=len(t_range))
stock_df.head(10)
                     data1  data2
2016-01-01 00:00:00    147     44
2016-01-01 01:00:00    147     93
2016-01-01 02:00:00    159     81
2016-01-01 03:00:00     99    108
2016-01-01 04:00:00    119     55
2016-01-01 05:00:00    136    106
stock_df.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x249d416d438>

import matplotlib.pyplot as plt
plt.show()

Resample weekly, then plot

weekly_df = DataFrame()
weekly_df['data1'] = stock_df['data1'].resample('W').mean()
weekly_df['data2'] = stock_df['data2'].resample('W').mean()
weekly_df.head()
                 data1      data2
2016-01-03  119.000000  85.111111
2016-01-10  119.095238  90.345238
weekly_df.plot()
plt.show()


X. Binning

What is binning?

Think of packing apples of different sizes into different boxes.

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
score_list = np.random.randint(0,100,size=20)
score_list
array([87, 83, 40, 14, 4, 31, 46, 17, 96, 99, 39, 4, 85, 8, 64, 82, 32, 28, 36, 40])
bins = [0,59,70,80,100]

Binning

# cut(array, bins) assigns each value in the array to one of the bin intervals
score_cut = pd.cut(score_list,bins)
score_cut
[(80, 100], (80, 100], (0, 59], (0, 59], (0, 59], ..., (80, 100], (0, 59], (0, 59], (0, 59], (0, 59]]
Length: 20
Categories (4, interval[int64]): [(0, 59] < (59, 70] < (70, 80] < (80, 100]]
# value_counts() counts how many values fall in each interval
pd.value_counts(score_cut)
(0, 59]      13
(80, 100]     6
(59, 70]      1
(70, 80]      0
dtype: int64
df = DataFrame()
df['score'] = score_list
# pd.util.testing.rands(3) generates a random 3-character string
df['student'] = [pd.util.testing.rands(3) for i in range(20)]
df.head()
   score student
0     87     w5S
1     83     cQX
2     40     JYH
3     14     Cs3
4      4     niX

The cut() function

cut(data, bins, labels=['one label per interval', ...])


df['Categories'] = pd.cut(df['score'],bins,labels=['Low','OK','Good','Great'])
df.head()
   score student Categories
0     87     w5S      Great
1     83     cQX      Great
2     40     JYH        Low
3     14     Cs3        Low
4      4     niX        Low

XI. Grouping with GroupBy

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

The idea works like SQL's GROUP BY.

df = pd.read_csv('weather.csv')
df.head()
        date city  temperature  wind
0   2017/1/1   GZ           12     3
1   2017/1/8   BJ            2     5
2  2017/1/15   SH           -4     6
3  2017/1/22   GZ            6     1
4  2017/1/29   BJ           13     3

Grouping

g = df.groupby(df['city'])
g
# inspect the grouping: which rows went to which group
g.groups
{'BJ': Int64Index([1, 4, 7, 10, 13, 16, 19, 22, 25, 28], dtype='int64'),
 'GZ': Int64Index([0, 3, 6, 9, 12, 15, 18, 21, 24, 27], dtype='int64'),
 'SH': Int64Index([2, 5, 8, 11, 14, 17, 20, 23, 26], dtype='int64')}
# fetch one group
g_gz = g.get_group('GZ')
g_gz.head()
         date city  temperature  wind
0    2017/1/1   GZ           12     3
3   2017/1/22   GZ            6     1
6   2017/2/12   GZ           12     5
9    2017/3/5   GZ            6     9
12  2017/3/26   GZ           12     6
# mean of the GZ group
g_gz.mean()
temperature    9.0
wind           4.5
dtype: float64
# mean of every group
g.mean()
      temperature  wind
city                   
BJ       7.500000   4.5
GZ       9.000000   4.5
SH      -1.333333   4.0

XII. Aggregation

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
df = pd.read_csv('weather.csv')
g = df.groupby('city')

Aggregation

mean(), count(), max(), and so on are all aggregations.

# g.agg('min') is just g.min(); same for the other reducers
g.agg('min')
           date  temperature  wind
city                              
BJ    2017/1/29            2     1
GZ     2017/1/1            6     1
SH    2017/1/15           -4     1

Aggregating with a custom function

def foo(attr):
    return attr.max() - attr.min()
g.agg(foo)
      temperature  wind
city                   
BJ             11     8
GZ              6     8
SH              6     6
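agg() also accepts a list of functions; limited to the temperature column, the min/max pairs implied by the two results above would come out as:

g['temperature'].agg(['min','max'])
      min  max
city          
BJ      2   13
GZ      6   12
SH     -4    2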

Grouping by multiple columns

g_new = df.groupby(['city','wind'])
g_new.groups
{('BJ', 1): Int64Index([13], dtype='int64'),
 ('BJ', 2): Int64Index([25], dtype='int64'),
 ('BJ', 3): Int64Index([4, 10], dtype='int64'),
 ('BJ', 4): Int64Index([7], dtype='int64'),
 ...}
g_new.get_group(('BJ',3))
         date city  temperature  wind
4   2017/1/29   BJ           13     3
10  2017/3/12   BJ           13     3

Note the tuple unpacking when iterating over groups keyed by multiple columns:

for (name_1,name_2),group in g_new:
    print(name_1,name_2)
    print(group)
BJ 1
        date city  temperature  wind
13  2017/4/2   BJ            2     1
BJ 2
         date city  temperature  wind
25  2017/6/25   BJ            2     2

...


XIII. Pivot tables

The pivot-table concept comes from Excel: rows and columns are rearranged to present the data better, and the values are reshaped accordingly.

import numpy as np
import pandas as pd
from pandas import Series,DataFrame

No data file comes with these notes, so only a sketch:

# read data from an Excel file (the filename was omitted in the original notes)
df = pd.read_excel()
# pivot_table: index picks the column(s) to pivot on,
# values picks the columns whose data to aggregate
pd.pivot_table(df,index=[''],...)
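Since there is no data file, here is a minimal self-contained sketch; the city/month/sales columns are invented for illustration:

# hypothetical data, only to make the call concrete
df = DataFrame({'city':['BJ','BJ','SH','SH'],
                'month':['Feb','Jan','Feb','Jan'],
                'sales':[120,100,80,90]})
# one row per city, one column per month; cells hold the (mean) sales
pd.pivot_table(df,index=['city'],columns=['month'],values=['sales'])
      sales     
month   Feb  Jan
city            
BJ      120  100
SH       80   90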