Python数据处理常用的20个案例

原创于 2025-06-27 13:06:40 发布 · 442 阅读

8 ·

CC 4.0 BY-SA版权

文章标签：

#python #开发语言 #pip #Python学习 #学习 #数据处理 #Python数据

该文章已生成可运行项目。

Python凭借其简洁易懂的语法和丰富的第三方库，可高效地完成数据处理、数据分析等工作。本文将介绍20个Python在数据处理中常用的案例。

包含编程资料、学习路线图、源代码、软件安装包等！【[点击这里]】！

1. 数据读取

import pandas as pd

# Read a CSV file into a DataFrame.
df = pd.read_csv('data.csv')

# Read an Excel file; this rebinds df, replacing the CSV result above.
df = pd.read_excel('data.xlsx')

# Read a JSON file; this rebinds df again, so only this last frame stays
# bound to the name.
df = pd.read_json('data.json')

2. 数据查看

# Preview the first rows of the frame.
print(df.head())

# Print a concise summary (columns, dtypes, non-null counts).
# info() writes to stdout itself and returns None, so wrapping it in
# print() would emit an extra "None" line after the summary.
df.info()

# Print descriptive statistics.
print(df.describe())

3. 数据清洗

# Drop exact duplicate rows in place.
df.drop_duplicates(inplace=True)

# Fill missing values with 0 in numeric columns only. A blanket
# df.fillna(0) would also write the integer 0 into text columns and into
# 'date', corrupting the date parsing below.
numeric_columns = df.select_dtypes(include='number').columns
df.fillna(dict.fromkeys(numeric_columns, 0), inplace=True)

# Parse the 'date' column into datetime values.
df['date'] = pd.to_datetime(df['date'])

4. 数据选择

# Keep only the columns of interest.
df = df[['name', 'age', 'city']]

# Keep rows where age is greater than 18. The explicit copy makes the
# result an independent frame, so later column assignments do not trigger
# chained-assignment warnings.
df = df[df['age'] > 18].copy()

# Read the 'name' of the first remaining row. Positional access is used
# because the row labelled 0 may have been removed by the filter above,
# in which case df.loc[0, 'name'] raises KeyError.
# Raises IndexError if the filter left no rows.
value = df['name'].iloc[0]

5. 数据排序

# Sort rows by 'age', smallest first, modifying df in place.
df.sort_values(by='age', ascending=True, inplace=True)

# Re-sort rows by 'age', largest first; this ordering replaces the one above.
df.sort_values(by='age', ascending=False, inplace=True)

6. 数据分组

# Group rows by 'city' and compute the mean 'age' within each group.
grouped = (
    df.groupby(by='city')['age']
    .mean()
)

7. 数据聚合

# Mean of the 'age' column.
mean_age = df['age'].mean()

# Sum of the 'sales' column.
total_sales = df['sales'].sum()

# Maximum of the 'price' column.
max_price = df['price'].max()

8. 数据转换

# Convert the 'age' column to strings.
# NOTE(review): after this the column is no longer numeric, so numeric
# operations on 'age' (comparisons, mean, correlation) should run before
# this conversion — confirm against the intended order of use.
df['age'] = df['age'].astype(str)

# Upper-case every value in the 'city' column.
df['city'] = df['city'].str.upper()

9. 数据合并

# Inner-join df1 and df2 on their shared 'id' column.
merged_df = df1.merge(df2, how='inner', on='id')

10. 数据导出

# Write the frame to a CSV file, omitting the index column.
df.to_csv(path_or_buf='output.csv', index=False)

# Write the frame to an Excel workbook, omitting the index column.
df.to_excel(excel_writer='output.xlsx', index=False)

11. 字符串处理

# Keep rows whose 'name' contains the literal text 'John'.
# na=False treats missing names as non-matching; without it the mask
# contains NaN and boolean indexing fails. regex=False makes the search a
# plain substring test. The copy keeps the assignment below from writing
# into a slice of the unfiltered frame.
df = df[df['name'].str.contains('John', na=False, regex=False)].copy()

# Replace the literal text 'New York' with 'NYC'. regex=False is explicit
# because the default for this argument differs between pandas versions.
df['city'] = df['city'].str.replace('New York', 'NYC', regex=False)

12. 日期处理

# Make sure 'date' holds datetime values before using the .dt accessor;
# pd.to_datetime leaves an already-converted column unchanged.
dates = pd.to_datetime(df['date'])

# Extract the calendar year.
df['year'] = dates.dt.year

# Extract the calendar month.
df['month'] = dates.dt.month

# Whole days between 'start_date' and 'end_date'. Both columns are
# converted first because columns read from a file arrive as strings, and
# subtracting strings raises TypeError.
df['days_diff'] = (
    pd.to_datetime(df['end_date']) - pd.to_datetime(df['start_date'])
).dt.days

13. 正则表达式

import re

# Flag rows whose 'email' value is, in its entirety, a plausible address.
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
# fullmatch requires the whole string to match; str.match only anchors at
# the start, so 'a@b.com and trailing junk' would have been accepted.
# na=False reports missing emails as invalid instead of NaN.
df['is_valid_email'] = df['email'].str.fullmatch(email_pattern, na=False)

14. 数据可视化

import matplotlib.pyplot as plt

# Draw a bar chart of population per city on the current axes.
ax = plt.gca()
ax.bar(df['city'], df['population'])
ax.set_xlabel('City')
ax.set_ylabel('Population')
ax.set_title('City Population')
plt.show()

15. 统计分析

from scipy import stats

# Pearson correlation coefficient between 'age' and 'income'.
correlation = df['age'].corr(df['income'], method='pearson')

# Independent two-sample t-test comparing 'group1' with 'group2'.
t_statistic, p_value = stats.ttest_ind(a=df['group1'], b=df['group2'])

16. 机器学习

from sklearn.linear_model import LinearRegression

# Select the feature columns once and reuse them for fitting and prediction.
features = df[['feature1', 'feature2']]

# Fit a linear regression of 'target' on the two features.
model = LinearRegression()
model.fit(features, df['target'])

# Predict on the same rows the model was fitted on.
predictions = model.predict(features)

17. 数据存储

# Store the DataFrame rows in the MySQL table `customers`.
import mysql.connector

# Connection settings are placeholders; replace them with real credentials.
mydb = mysql.connector.connect(
    host="localhost",
    user="youruser",
    password="yourpassword",
    database="mydatabase")

try:
    mycursor = mydb.cursor()
    try:
        # Parameterized statement: values are bound by the driver, never
        # interpolated into the SQL text.
        sql = "INSERT INTO customers (name, age, city) VALUES (%s, %s, %s)"

        # Build plain Python tuples. Going through an object array turns
        # numpy scalars (e.g. int64) into native Python values, which the
        # connector can convert; it also avoids the slow per-row iterrows().
        rows = [
            tuple(record)
            for record in df[['name', 'age', 'city']]
            .to_numpy(dtype=object)
            .tolist()
        ]

        # Send all rows in one batched call instead of one execute per row.
        if rows:
            mycursor.executemany(sql, rows)
        mydb.commit()
    except Exception:
        # Leave the table untouched if any row fails, then re-raise.
        mydb.rollback()
        raise
    finally:
        mycursor.close()
finally:
    # Always release the connection, even on failure.
    mydb.close()

18. 网页抓取（Web Scraping）

import requests
from bs4 import BeautifulSoup

# Send the request. A timeout is set so an unresponsive server cannot hang
# the script indefinitely (requests applies no timeout by default).
url = 'https://www.example.com'
response = requests.get(url, timeout=10)
# Raise requests.HTTPError on 4xx/5xx instead of parsing an error page.
response.raise_for_status()

# Parse the HTML body.
soup = BeautifulSoup(response.content, 'html.parser')

# Extract the page title; pages without a <title> element yield None
# rather than raising AttributeError.
title = soup.title.string if soup.title is not None else None

19. 自然语言处理

import nltk
from nltk.tokenize import word_tokenize

# Split a sentence into word tokens.
text = "This is a sample sentence."
try:
    tokens = word_tokenize(text)
except LookupError:
    # The tokenizer models are not bundled with nltk; on a fresh install
    # word_tokenize raises LookupError. Fetch them once and retry.
    # Both resource names are requested because the required one depends
    # on the installed nltk version.
    nltk.download('punkt', quiet=True)
    nltk.download('punkt_tab', quiet=True)
    tokens = word_tokenize(text)

20. 时间序列分析

import pandas as pd

from statsmodels.tsa.arima.model import ARIMA

# Use the 'date' column as the index, modifying df in place.
df.set_index(keys='date', inplace=True)

# Fit an ARIMA model with order (p, d, q) = (5, 2, 0) on the 'value' series.
model = ARIMA(endog=df['value'], order=(5, 2, 0))
model_fit = model.fit()

# Forecast the next 10 steps beyond the end of the series.
predictions = model_fit.forecast(steps=10)