Pandas数据操作与清洗实战指南-优快云博客

本文链接：https://blog.youkuaiyun.com/qq_40216188/article/details/125181457

pandas操作大全

一、pandas操作大全

一、pandas操作大全

1.1 导入数据

pd.read_csv(filename) # 从CSV文件 
pd.read_table(filename) # 从分隔的文本文件（例如CSV）中 
pd.read_excel(filename) # 从Excel文件 
pd.read_sql(query, connection_object) # 从SQL表/数据库中读取 
pd.read_json(json_string) # 从JSON格式的字符串，URL或文件中读取。
pd.read_html(url) # 解析html URL，字符串或文件，并将表提取到数据帧列表 
pd.read_clipboard() # 获取剪贴板的内容并将其传递给 read_table() 
pd.DataFrame(dict) # 从字典中，列名称的键，列表中的数据的值

1.2 导出数据

df.to_csv(filename) # 写入CSV文件 
df.to_excel(filename) # 写入Excel文件 
df.to_sql(table_name, connection_object) # 写入SQL表 
df.to_json(filename) # 以JSON格式写入文件

1.3 添加日期索引

df=pd.DataFrame(np.random.rand(5,5))               
# 添加日期索引，索引长度等于数据的行数
df.index = pd.date_range('1900/1/30', periods=df.shape[0]) 
df

在这里插入图片描述

1.4 查看、检查数据

n=5
df.head(n)                       # DataFrame的前n行 
df.tail(n)                       # DataFrame的最后n行 
df.shape                         # 行数和列数 
df.info()                        # 索引，数据类型和内存信息 
"""
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 1900-01-30 to 1900-02-03
Freq: D
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       5 non-null      float64
 1   1       5 non-null      float64
 2   2       5 non-null      float64
 3   3       5 non-null      float64
 4   4       5 non-null      float64
dtypes: float64(5)
memory usage: 240.0 bytes
"""
df.describe()                    # 数值列的摘要统计信息 
"""
0	       1		 	2			3		 	4
count	5.000000	5.000000	5.000000	5.000000	5.000000
mean	0.689966	0.426089	0.833323	0.401713	0.628914
std		0.322934	0.351482	0.061687	0.279294	0.326423
min		0.158050	0.126149	0.730606	0.053410	0.227080
25%		0.634945	0.158593	0.835841	0.161281	0.332720
50%		0.782703	0.294766	0.847640	0.490566	0.792833
75%		0.921796	0.586704	0.856627	0.646081	0.834723
max		0.952335	0.964232	0.895900	0.657225	0.957211
"""
df.iloc[:,0].value_counts(dropna=False)     # 查看唯一值和计数 
df.apply(pd.Series.value_counts) # 所有列的唯一值和计数
"""
			 0	 1	 2	 3	 4
0.053410	NaN	NaN	NaN	1.0	NaN
0.126149	NaN	1.0	NaN	NaN	NaN
0.158050	1.0	NaN	NaN	NaN	NaN
0.158593	NaN	1.0	NaN	NaN	NaN
0.161281	NaN	NaN	NaN	1.0	NaN
0.227080	NaN	NaN	NaN	NaN	1.0
0.294766	NaN	1.0	NaN	NaN	NaN
0.332720	NaN	NaN	NaN	NaN	1.0
0.490566	NaN	NaN	NaN	1.0	NaN
0.586704	NaN	1.0	NaN	NaN	NaN
0.634945	1.0	NaN	NaN	NaN	NaN
0.646081	NaN	NaN	NaN	1.0	NaN
0.657225	NaN	NaN	NaN	1.0	NaN
0.730606	NaN	NaN	1.0	NaN	NaN
0.782703	1.0	NaN	NaN	NaN	NaN
0.792833	NaN	NaN	NaN	NaN	1.0
0.834723	NaN	NaN	NaN	NaN	1.0
0.835841	NaN	NaN	1.0	NaN	NaN
0.847640	NaN	NaN	1.0	NaN	NaN
0.856627	NaN	NaN	1.0	NaN	NaN
0.895900	NaN	NaN	1.0	NaN	NaN
0.921796	1.0	NaN	NaN	NaN	NaN
0.952335	1.0	NaN	NaN	NaN	NaN
0.957211	NaN	NaN	NaN	NaN	1.0
0.964232	NaN	1.0	NaN	NaN	NaN
"""

1.5 数据选取

#使用这些命令选择数据的特定子集。
df[col]               # 返回带有标签col的列 
df[[col1, col2]]      # 返回列作为新的DataFrame 
s.iloc[0]             # 按位置选择 
s.loc['index_one']    # 按索引选择 
df.iloc[0,:]          # 第一行 
df.iloc[0,0]          # 第一栏的第一元素

1.6 数据清理

df.columns = ['a','b','c']                  # 重命名列 
pd.isnull()                                 # 空值检查，返回Boolean Arrray 
pd.notnull()                                # 与pd.isnull() 相反 
df.dropna()                                 # 删除所有包含空值的行 
df.dropna(axis=1)                           # 删除所有包含空值的列 
df.dropna(axis=1,thresh=n)                  # 删除所有具有少于n个非null值的行 
df.fillna(x)                                # 将所有空值替换为x 
s.fillna(s.mean())                          # 用均值替换所有空值（均值可以用统计模块中的几乎所有函数替换 ） 
s.astype(float)                             # 将系列的数据类型转换为float 
s.replace(1,'one')                          # 1 用 'one' 
s.replace([1,3],['one','three'])            # 替换所有等于的值 替换为所有1 'one' ，并 3 用 'three' df.rename(columns=lambda x: x + 1)          # 列的重命名 
df.rename(columns={'old_name': 'new_ name'})# 选择性重命名 
df.set_index('column_one')                  # 更改索引 
df.rename(index=lambda x: x + 1)            # 大规模重命名索引

1.7 筛选，排序和分组依据

df[df[col] &gt; 0.5]                      
# 列 col 大于 0.5 
df[(df[col] &gt; 0.5) &(df[col] &lt; 0.7)]  # 小于 0.7 大于0.5的行 
df.sort_values(col1)                   # 按col1升序对值进行排序 
df.sort_values(col2,ascending=False)   # 按col2 降序对值进行 排序 
df.sort_values([col1,col2],ascending=[True,False]) #按 col1 升序排序，然后 col2 按降序排序 
df.groupby(col)                        #从一个栏返回GROUPBY对象 
df.groupby([col1,col2]) # 返回来自多个列的groupby对象 
# 返回中的值的平均值 col2，按中的值分组 col1 （平均值可以用统计模块中的几乎所有函数替换 ）
df.groupby(col1)[col2]      
# 创建一个数据透视表组通过 col1 ，并计算平均值的 col2 和 col3 
df.pivot_table(index=col1,values=[col2,col3],aggfunc=mean) 
df.groupby(col1).agg(np.mean)          # 在所有列中找到每个唯一col1 组的平均值 
df.apply(np.mean)                      #np.mean() 在每列上应用该函数 
df.apply(np.max,axis=1)                # np.max() 在每行上应用功能

1.8 数据合并

df1.append(df2)                   # 将df2添加 df1的末尾 （各列应相同） 
pd.concat([df1, df2],axis=1)      # 将 df1的列添加到df2的末尾 （行应相同） 
# 'how'可以是一个 'left'， 'right'， 'outer'， 'inner'
df1.join(df2,on=col1,how='inner')

1.9 数据统计

df.describe()    # 数值列的摘要统计信息 
df.mean()        # 返回均值的所有列 
df.corr()        # 返回DataFrame中各列之间的相关性 
df.count()       # 返回非空值的每个数据帧列中的数字 
df.max()         # 返回每列中的最高值 
df.min()         # 返回每一列中的最小值 
df.median()      # 返回每列的中位数 
df.std()         # 返回每列的标准偏差

1.10 数据清洗函数

# 导入数据集
import pandas as pd

df ={'姓名':[' 黄同学','黄至尊','黄老邪','陈大美','孙尚香'],
     '英文名':['Huang tong_xue','huang zhi_zun','Huang Lao_xie','Chen Da_mei','sun shang_xiang'],
     '性别':['男','women','men','女','男'],
     '身份证':['463895200003128433','429475199912122345','420934199110102311','431085200005230122','420953199509082345'],
     '身高':['mid:175_good','low:165_bad','low:159_bad','high:180_verygood','low:172_bad'],
     '家庭住址':['湖北广水','河南信阳','广西桂林','湖北孝感','广东广州'],
     '电话号码':['13434813546','19748672895','16728613064','14561586431','19384683910'],
     '收入':['1.1万','8.5千','0.9万','6.5千','2.0万']}
df = pd.DataFrame(df)
df

在这里插入图片描述

1.10.1 字符串的拼接（cat函数）

df["姓名"].str.cat(df["家庭住址"],sep='-'*3)
#结果
"""
0     黄同学---湖北广水
1     黄至尊---河南信阳
2    黄老邪 ---广西桂林
3     陈大美---湖北孝感
4     孙尚香---广东广州
Name: 姓名, dtype: object
"""

1.10.2 判断某个字符串是否包含给定字符（contains函数）

df["家庭住址"].str.contains("广")
"""
0     True
1    False
2     True
3    False
4     True
Name: 家庭住址, dtype: bool
"""

1.10.3 判断某个字符串是否以…开头/结尾（startswith/endswith函数）

# 第一个行的“ 黄伟”是以空格开头的
df["姓名"].str.startswith("黄") 
"""
0    False
1     True
2     True
3    False
4    False
Name: 姓名, dtype: bool
"""
df["英文名"].str.endswith("e")
"""
0     True
1    False
2     True
3    False
4    False
Name: 英文名, dtype: bool
"""

1.10.4 计算给定字符在字符串中出现的次数（count函数）

#计算给定字符在字符串中出现的次数，即字符3在电话号码中出现的次数
df["电话号码"].str.count("3")
"""
0    3
1    0
2    1
3    1
4    2
Name: 电话号码, dtype: int64
"""

1.10.5 获取指定位置的字符串（get函数）

#获取指定位置的字符串
df["姓名"].str.get(-1)
"""
0    学
1    尊
2    邪
3    美
4    香
Name: 姓名, dtype: object
"""
df["身高"].str.split(":")
"""
0         [mid, 175_good]
1          [low, 165_bad]
2          [low, 159_bad]
3    [high, 180_verygood]
4          [low, 172_bad]
Name: 身高, dtype: object
"""
df["身高"].str.split(":").str.get(0)
"""
0     mid
1     low
2     low
3    high
4     low
Name: 身高, dtype: object
"""

1.10.6 计算字符串长度（len函数）

#计算字符串长度
df["性别"].str.len()
"""
0    1
1    5
2    3
3    1
4    1
Name: 性别, dtype: int64
"""

1.10.7 英文大小写转换（upper/lower函数）

#英文大小写转换
df["英文名"].str.upper()
"""
0     HUANG TONG_XUE
1      HUANG ZHI_ZUN
2      HUANG LAO_XIE
3        CHEN DA_MEI
4    SUN SHANG_XIANG
Name: 英文名, dtype: object
"""
df["英文名"].str.lower()
"""
0     huang tong_xue
1      huang zhi_zun
2      huang lao_xie
3        chen da_mei
4    sun shang_xiang
Name: 英文名, dtype: object
"""

1.10.8 在字符串的左边、右边或左右两边添加给定字符（pad函数+side参数/center）

#在字符串的左边、右边或左右两边添加给定字符
df["家庭住址"].str.pad(10,fillchar="*")      # 相当于ljust()
"""
0    ******湖北广水
1    ******河南信阳
2    ******广西桂林
3    ******湖北孝感
4    ******广东广州
Name: 家庭住址, dtype: object
"""
df["家庭住址"].str.pad(10,side="right",fillchar="*")    # 相当于rjust()
"""
0    湖北广水******
1    河南信阳******
2    广西桂林******
3    湖北孝感******
4    广东广州******
Name: 家庭住址, dtype: object
"""
df["家庭住址"].str.center(10,fillchar="*")
"""
0    ***湖北广水***
1    ***河南信阳***
2    ***广西桂林***
3    ***湖北孝感***
4    ***广东广州***
Name: 家庭住址, dtype: object
"""

1.10.9 重复字符串几次（repeat函数）

#重复字符串几次
df["性别"].str.repeat(3)
"""
0                男男男
1    womenwomenwomen
2          menmenmen
3                女女女
4                男男男
Name: 性别, dtype: object
"""

1.10.10 使用给定的字符串，替换指定的位置的字符（slice_replace函数）

#使用给定的字符串，替换指定的位置的字符
df["电话号码"].str.slice_replace(4,8,"*"*4)
"""
0    1343****546
1    1974****895
2    1672****064
3    1456****431
4    1938****910
Name: 电话号码, dtype: object
"""

1.10.11 将指定位置的字符，替换为给定的字符串（replace函数）

#将指定位置的字符，替换为给定的字符串
df["身高"].str.replace(":","-")
"""
0         mid-175_good
1          low-165_bad
2          low-159_bad
3    high-180_verygood
4          low-172_bad
Name: 身高, dtype: object
"""

1.10.12 将字符串拆分（split函数+expand参数）

# 将指定位置的字符，替换为给定的字符串(接受正则表达式)
# replace中传入正则表达式，才叫好用；- 先不要管下面这个案例有没有用，你只需要知道，使用正则做数据清洗多好用；
df["收入"].str.replace("\d+\.\d+","正则")
"""
0    正则万
1    正则千
2    正则万
3    正则千
4    正则万
Name: 收入, dtype: object
"""

#搭配join方法功能很强大
# 普通用法
df["身高"].str.split(":")
"""
0         [mid, 175_good]
1          [low, 165_bad]
2          [low, 159_bad]
3    [high, 180_verygood]
4          [low, 172_bad]
Name: 身高, dtype: object
"""

# split方法，搭配expand参数
df[["身高描述","final身高"]] = df["身高"].str.split(":",expand=True)
df

在这里插入图片描述

# split方法搭配join方法
df["身高"].str.split(":").str.join("?"*5)
"""
0         mid?????175_good
1          low?????165_bad
2          low?????159_bad
3    high?????180_verygood
4          low?????172_bad
Name: 身高, dtype: object
"""

1.10.13 去除空白符、换行符（strip/rstrip/lstrip函数）

#去除空白符、换行符
df["姓名"].str.len()
df["姓名"] = df["姓名"].str.strip()
df["姓名"].str.len()

1.10.14 利用正则表达式，去字符串中匹配，返回查找结果的列表（findall函数）

# 利用正则表达式，去字符串中匹配，返回查找结果的列表
# findall使用正则表达式，做数据清洗，真的很香！
df["身高"]
"""
0         mid:175_good
1          low:165_bad
2          low:159_bad
3    high:180_verygood
4          low:172_bad
Name: 身高, dtype: object
"""
#寻找身高中所有的字母
df["身高"].str.findall("[a-zA-Z]+")
"""
0         [mid, good]
1          [low, bad]
2          [low, bad]
3    [high, verygood]
4          [low, bad]
Name: 身高, dtype: object
"""

1.10.15 接受正则表达式，抽取匹配的字符串(一定要加上括号)（extract/extractall函数）

#接受正则表达式，抽取匹配的字符串(一定要加上括号)，extract只匹配第一个匹配项
df["身高"].str.extract("([a-zA-Z]+)")
"""
0
0	mid
1	low
2	low
3	high
4	low
"""

# extractall提取得到复合索引，匹配所有匹配项
df["身高"].str.extractall("([a-zA-Z]+)")
"""
0
match	
0	0	mid
	1	good
1	0	low
	1	bad
2	0	low
	1	bad
3	0	high
	1	verygood
4	0	low
	1	bad
"""

# extract搭配expand参数
df["身高"].str.extract("([a-zA-Z]+).*?([a-zA-Z]+)",expand=True)
"""
	 0	  1
0	mid	 good
1	low	 bad
2	low	 bad
3	high verygood
4	low  bad
"""