1.删除重复行
- 使用duplicated()函数检测重复的行
- 返回元素为布尔类型的Series对象
- 每个元素对应一行,如果该行不是第一次出现,则元素为True
- 参数keep:first(默认,除第一次出现的行外,其余重复行标记为True)、last(除最后一次出现的行外,其余重复行标记为True)、False(所有重复行都标记为True)
- 参数subset:在每一行对比时,只比较此列数据是否有重复值
import numpy as np
import pandas as pd
def make_df(indexs, columns):
    """Build a demo DataFrame whose cells combine the column and row labels.

    Args:
        indexs: Row labels; used as the DataFrame index.
        columns: Column labels; used as the DataFrame columns.

    Returns:
        A DataFrame in which the cell at row ``i`` and column ``j`` holds
        the string ``str(j) + str(i)`` (for example ``'A1'``).
    """
    # One inner list per row label, one cell per column label.
    data = [[str(j) + str(i) for j in columns] for i in indexs]
    return pd.DataFrame(data=data, index=indexs, columns=columns)
# Build a 4x4 frame, then copy row 2 over row 1 so the two rows are identical.
df = make_df([1, 2, 3, 4], list('ABCD'))
df.loc[1] = df.loc[2]
print(df)

# Print the duplicate mask for each option set, with a separator before each.
separator = '#####################################'
for options in (
    {},
    {'keep': 'first'},
    {'keep': 'last'},
    {'keep': False},
    {'subset': ['A', 'B']},
):
    print(separator)
    print(df.duplicated(**options))
- 使用drop_duplicates()函数删除重复行
- 如果两行不完全重复,则无法直接删除,须使用subset参数
- 使用keep参数控制保留哪一行数据
import numpy as np
import pandas as pd
def make_df(indexs, columns):
    """Return a DataFrame filled with ``<column label><row label>`` strings.

    Args:
        indexs: Row labels; used as the DataFrame index.
        columns: Column labels; used as the DataFrame columns.

    Returns:
        A DataFrame whose cell at row ``i`` / column ``j`` is
        ``str(j) + str(i)``.
    """
    rows = []
    for i in indexs:
        rows.append([str(j) + str(i) for j in columns])
    return pd.DataFrame(data=rows, index=indexs, columns=columns)
# Rows 1 and 2 match on columns A-C but differ in D, so they are only
# duplicates when the comparison is restricted with subset.
df = make_df([1, 2, 3, 4], list('ABCD'))
df.loc[1] = df.loc[2]
df.loc[1, 'D'] = 'F'
print(df)

separator = '#####################################'
key_columns = ['A', 'B', 'C']

print(separator)
print(df.drop_duplicates(subset=key_columns))
print(separator)
print(df.drop_duplicates(subset=key_columns, keep='last'))
2.数据映射
映射的含义:创建一个映射关系表,把values元素和一个特定的标签或字符串绑定
- replace()函数:替换元素
import numpy as np
import pandas as pd
# Score table: one row per student, one column per subject.
scores = np.array([
    [60, 70, 80, 90],
    [65, 75, 85, 95],
    [62, 72, 82, 92],
    [68, 78, 88, 98],
])
df = pd.DataFrame(
    data=scores,
    index=['小明', '小红', '小王', '小张'],
    columns=['Python', 'Java', 'C++', 'C#'],
)
print(df)
print('####################################')

# replace() with a dict swaps every cell equal to a key for the mapped value;
# the result is bound to df2 and df is printed unchanged above.
replacements = {85: 101, 98: 102}
df2 = df.replace(replacements)
print(df2)
- map()函数:适合处理某一单独的列,一般用在Series上;pandas 2.1之前DataFrame没有map()方法(2.1起新增了逐元素的DataFrame.map(),用于取代applymap());可以使用lambda函数
import numpy as np
import pandas as pd
# Score table: one row per student, one column per subject.
scores = np.array([
    [60, 70, 57, 90],
    [65, 55, 78, 95],
    [62, 93, 82, 92],
    [68, 78, 99, 98],
])
df = pd.DataFrame(
    data=scores,
    index=['小明', '小红', '小王', '小张'],
    columns=['Python', 'Java', 'C++', 'C#'],
)
print(df)

separator = '####################################'
python_scores = df['Python']

# map() with a dict: only 60 has an entry in the first call.
print(separator)
print(python_scores.map({60: 600}))
print(separator)
print(python_scores.map({60: 600, 65: 650, 62: 620, 68: 680}))

# map() with a callable applied to each element.
print(separator)
print(python_scores.map(lambda x: x * 10))

# Store mapped results as new columns.
print(separator)
df['Pandas'] = python_scores.map(lambda x: x * 10)
print(df)
print(separator)
df['Java成绩是否及格'] = df['Java'].map(lambda n: '及格' if n >= 60 else '不及格')
print(df)
print(separator)
def fn(n):
    """Translate a numeric score into a grade label.

    Args:
        n: The score to grade.

    Returns:
        '不及格' when ``n`` is below 60, '及格' when it is at least 60 but
        below 80, and '优秀' otherwise.
    """
    if n < 60:
        return '不及格'
    if n < 80:
        return '及格'
    return '优秀'
# Grade every C++ score with fn and store the labels in a new column.
cpp_grades = df['C++'].map(fn)
df['C++等级'] = cpp_grades
print(df)
- rename()函数:替换索引
- 默认修改行索引名
- 使用axis=1修改列索引名
- 使用index和columns参数控制修改行或者列索引名
import numpy as np
import pandas as pd
# Score table: one row per student, one column per subject.
scores = np.array([
    [60, 70, 57, 90],
    [65, 55, 78, 95],
    [62, 93, 82, 92],
    [68, 78, 99, 98],
])
df = pd.DataFrame(
    data=scores,
    index=['小明', '小红', '小王', '小张'],
    columns=['Python', 'Java', 'C++', 'C#'],
)
print(df)

# Each rename() result is only printed; none is assigned back to df.
separator = '####################################'
renamed_frames = [
    df.rename({'小明': '张三'}),                    # positional mapper, no axis given
    df.rename({'Python': 'JavaScript'}, axis=1),   # positional mapper with axis=1
    df.rename({'Python': 'JavaScript'}, axis=1),   # same call, repeated
    df.rename(index={'小王': '李四'}),              # explicit index= keyword
    df.rename(columns={'Java': 'JavaScript'}),     # explicit columns= keyword
]
for renamed in renamed_frames:
    print(separator)
    print(renamed)
- reset_index()函数和set_index(keys=[''])函数:重置索引及设置行索引
- reset_index():重置索引,原行索引会变为一列数据,此列数据的列索引为index
- set_index(keys=['']):设置行索引,参数keys,设置某一列为行索引,原行索引消失
import numpy as np
import pandas as pd
# Score table: one row per student, one column per subject.
scores = np.array([
    [60, 70, 57, 90],
    [65, 55, 78, 95],
    [62, 93, 82, 92],
    [68, 78, 99, 98],
])
df = pd.DataFrame(
    data=scores,
    index=['小明', '小红', '小王', '小张'],
    columns=['Python', 'Java', 'C++', 'C#'],
)
print(df)

separator = '####################################'

# reset_index(): print the frame with its row labels reset.
print(separator)
flattened = df.reset_index()
print(flattened)

# set_index(): print the frame re-indexed by the 'Java' column.
print(separator)
indexed_by_java = df.set_index(keys=['Java'])
print(indexed_by_java)
- apply()函数:既支持Series,也支持DataFrame
import numpy as np
import pandas as pd
# 5x3 frame of random integers drawn from [0, 10).
random_values = np.random.randint(0, 10, size=(5, 3))
df = pd.DataFrame(
    data=random_values,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)

separator = '#################################'

# Series.apply: the callable receives one element at a time.
print(separator)
print(df['Python'].apply(lambda x: bool(x > 5)))

# DataFrame.apply: the callable receives a whole Series; axis selects
# whether that Series is a column (axis=0) or a row (axis=1).
print(separator)
print(df.apply(lambda x: x.mean(), axis=0))
print(separator)
print(df.apply(lambda x: x.mean(), axis=1))
print(separator)
# Custom function to pass to apply().
def fn(x):
    """Summarise a Series as ``(mean rounded to 1 decimal, count)``.

    Args:
        x: An object exposing ``mean()`` and ``count()``, such as the Series
            that DataFrame.apply hands to the callable.

    Returns:
        A 2-tuple: the mean rounded to one decimal place and the value
        returned by ``x.count()``.
    """
    return (np.round(x.mean(), 1), x.count())
# Apply the custom function with axis=1.
print(df.apply(fn, axis=1))
print('#################################')
# Element-wise transform: the lambda receives each cell value.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map,
# so use the new name when this pandas version provides it and fall back to
# applymap on older versions.
if hasattr(pd.DataFrame, 'map'):
    print(df.map(lambda x: x + 100))
else:
    print(df.applymap(lambda x: x + 100))
- transform()函数
import numpy as np
import pandas as pd
# 5x3 frame of random integers drawn from [0, 10).
random_values = np.random.randint(0, 10, size=(5, 3))
df = pd.DataFrame(
    data=random_values,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)

separator = '#################################'
print(separator)
# Series.transform called with a list of two functions.
functions = [np.sqrt, np.exp]
print(df['Python'].transform(functions))
print(separator)
# Function to pass to DataFrame.transform.
def convert(x):
    """Scale ``x`` by 10 if its mean exceeds 5, otherwise by -10.

    Args:
        x: An object exposing ``mean()`` and supporting multiplication by an
            int, such as a Series.

    Returns:
        ``x * 10`` when ``x.mean() > 5``; ``x * (-10)`` otherwise (including
        when the mean is exactly 5).
    """
    if x.mean() > 5:
        return x * 10
    return x * (-10)
# axis=0 (the default): convert receives one column at a time.
print(df.transform(convert))
print('#################################')
# axis=1: convert receives one row at a time.
print(df.transform(convert, axis=1))
print('#################################')
3.异常值检测和过滤
- describe():查看每一列的描述性统计量
import numpy as np
import pandas as pd
# 5x3 frame of random integers drawn from [0, 10).
random_values = np.random.randint(0, 10, size=(5, 3))
df = pd.DataFrame(
    data=random_values,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)

separator = '#################################'
percentiles = [0.01, 0.3, 0.9]

# Default summary statistics.
print(separator)
print(df.describe())

# Summary with custom percentiles.
print(separator)
print(df.describe(percentiles=percentiles))

# With many columns, .T transposes the summary.
print(separator)
print(df.describe(percentiles=percentiles).T)
- std()函数:可以求得DataFrame对象每一列的标准差
import numpy as np
import pandas as pd
# 5x3 frame of random integers drawn from [0, 10).
random_values = np.random.randint(0, 10, size=(5, 3))
df = pd.DataFrame(
    data=random_values,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)
print('#################################')
# std() called on the whole frame.
column_std = df.std()
print(column_std)
- drop()函数:删除特定索引
import numpy as np
import pandas as pd
# 5x3 frame of random integers drawn from [0, 10).
random_values = np.random.randint(0, 10, size=(5, 3))
df = pd.DataFrame(
    data=random_values,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)

# Each drop() result is only printed; none is assigned back to df.
separator = '#################################'
single_drops = [
    df.drop('A'),                # positional label, no axis given
    df.drop('Python', axis=1),   # positional label with axis=1
    df.drop(index='E'),          # explicit index= keyword
    df.drop(columns='Pandas'),   # explicit columns= keyword
]
for dropped in single_drops:
    print(separator)
    print(dropped)

# Drop several rows and several columns in one call.
print(df.drop(index=['A', 'B'], columns=['Python', 'Numpy']))
- unique()函数:求唯一,去重
- DataFrame没有unique,得使用Series调用
import numpy as np
import pandas as pd
# Rows D and E are identical, so each column contains one repeated value.
scores = np.array([
    [60, 70, 80],
    [65, 75, 85],
    [62, 72, 82],
    [68, 78, 88],
    [68, 78, 88],
])
df = pd.DataFrame(
    data=scores,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)
print('#################################')
# unique() is called on a single column (a Series).
python_unique = df['Python'].unique()
print(python_unique)
- query()函数:按条件查询
- ==、>、<
- and,&
- or,|
- in
import numpy as np
import pandas as pd
# Rows D and E are identical.
scores = np.array([
    [60, 70, 80],
    [65, 75, 85],
    [62, 72, 82],
    [68, 78, 88],
    [68, 78, 88],
])
df = pd.DataFrame(
    data=scores,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)

separator = '#################################'

# Query strings covering comparison, and/&, or, and membership.
conditions = [
    'Python == 68',
    'Python > 62 and Numpy == 75',
    'Python > 62 & Numpy == 75',
    'Python > 62 or Numpy == 75',
    'Python in [68,62,90]',
]
for condition in conditions:
    print(separator)
    print(df.query(condition))

print(separator)
# A Python variable is referenced inside the query string with an @ prefix.
n = 60
print(df.query('Python == @n'))
- df.sort_values()函数和df.sort_index()函数:排序
- df.sort_values()函数:根据值排序
- 通过第一个参数by指定排序依据的列名(axis=1时为行索引名),默认升序
- 参数ascending:是否升序,默认True(升序),False(降序)
- df.sort_index()函数:根据索引排序(不常用)
- 默认是对行索引进行排序,默认是升序
- df.sort_values()函数:根据值排序
import numpy as np
import pandas as pd
# Rows D and E are identical.
df = pd.DataFrame(
    data=np.array([
        [60, 70, 80],
        [65, 75, 85],
        [62, 72, 82],
        [68, 78, 88],
        [68, 78, 88],
    ]),
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)
print('#################################')
# sort_values(): order rows by the values of the 'Python' column.
print(df.sort_values('Python'))
print('#################################')
print(df.sort_values('Python', ascending=False))
print('#################################')
# With axis=1 the key is a row label: order columns by the values in row 'A'.
print(df.sort_values('A', axis=1, ascending=False))
print('#################################')
# sort_index(): order by the labels themselves (row labels, then column labels).
print(df.sort_index(ascending=False))
print('#################################')
print(df.sort_index(ascending=False, axis=1))
- info()函数:查看数据信息
import numpy as np
import pandas as pd
# Rows D and E are identical.
df = pd.DataFrame(
    data=np.array([
        [60, 70, 80],
        [65, 75, 85],
        [62, 72, 82],
        [68, 78, 88],
        [68, 78, 88],
    ]),
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)
print('#################################')
# info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
4.抽样
- 使用.take()函数排序
import numpy as np
import pandas as pd
# Rows D and E are identical.
scores = np.array([
    [60, 70, 80],
    [65, 75, 85],
    [62, 72, 82],
    [68, 78, 88],
    [68, 78, 88],
])
df = pd.DataFrame(
    data=scores,
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)

separator = '#################################'
positions = [1, 0, 2]

# take() picks by integer position: rows first, then columns (printed twice).
print(separator)
print(df.take(positions))
for _ in range(2):
    print(separator)
    print(df.take(positions, axis=1))
- 无放回抽样
- 借助np.random.permutation()函数随机排序
import numpy as np
import pandas as pd
# Rows D and E are identical.
df = pd.DataFrame(
    data=np.array([
        [60, 70, 80],
        [65, 75, 85],
        [62, 72, 82],
        [68, 78, 88],
        [68, 78, 88],
    ]),
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)
print('#################################')
# Random ordering of positions 0-2. It is bound to its own name rather than
# to `np`, so the numpy alias stays usable after this snippet runs.
order = np.random.permutation([0, 1, 2])
# Sampling without replacement: each position appears exactly once.
print(df.take(order))
- 有放回抽样
import numpy as np
import pandas as pd
# Rows D and E are identical.
df = pd.DataFrame(
    data=np.array([
        [60, 70, 80],
        [65, 75, 85],
        [62, 72, 82],
        [68, 78, 88],
        [68, 78, 88],
    ]),
    index=list('ABCDE'),
    columns=['Python', 'Numpy', 'Pandas'],
)
print(df)
print('#################################')
# Five independent draws from positions 0-2. They are bound to their own name
# rather than to `np`, so the numpy alias stays usable after this snippet runs.
positions = np.random.randint(0, 3, size=5)
# Sampling with replacement: the same position may be drawn more than once.
print(df.take(positions))
知识点为听课总结笔记,课程为B站“千锋教育Pandas数据分析从入门到实战,零基础小白保姆级Python数据分析教程”:001_Pandas_Pandas介绍_哔哩哔哩_bilibili