告别数据孤岛:PyMySQL三招打通CSV、Excel与JSON数据流转
你是否还在为Python与MySQL间的数据格式转换焦头烂额?当业务需要快速导入用户CSV数据,或导出Excel报表时,低效的手动处理不仅浪费时间,还可能引入人为错误。本文将系统讲解如何使用PyMySQL实现CSV、Excel与JSON三种主流格式的无缝数据流转,包含完整代码示例与性能优化方案。读完本文,你将掌握从数据库查询到文件生成的全流程自动化技巧,彻底摆脱重复劳动。
核心模块与准备工作
PyMySQL的数据处理能力依赖于其核心连接模块与Python标准库的协同工作。基础架构包含:
- 连接层:pymysql/connections.py实现数据库连接管理,支持上下文管理器自动释放资源
- 游标层:pymysql/cursors.py提供数据检索接口,DictCursor可直接返回字段名映射
- 转换层:pymysql/converters.py处理MySQL与Python数据类型转换
安装依赖包:
pip install pymysql pandas openpyxl
基础连接代码模板:
import pymysql

# Context managers guarantee that both the connection and the cursor are
# released even if an exception occurs mid-query.
conn_params = dict(
    host="localhost",
    user="your_user",
    password="your_pass",
    database="your_db",
    charset="utf8mb4",
)
with pymysql.connect(**conn_params) as conn, \
        conn.cursor(pymysql.cursors.DictCursor) as cursor:
    cursor.execute("SELECT VERSION()")
    print("MySQL服务器版本:", cursor.fetchone())
CSV格式:批量数据的高效流转
CSV作为通用数据交换格式,适合百万级数据的导入导出。PyMySQL结合Python标准csv模块可实现零依赖处理。
从MySQL导出为CSV
import csv
import pymysql


def export_to_csv(query, filename):
    """Export the result set of *query* to a UTF-8 CSV file.

    Column names from the cursor description become the CSV header row.
    Rows are fetched in batches of 1000 so the full result set is never
    held in memory at once.

    Args:
        query: SELECT statement to execute (trusted SQL, no user input).
        filename: Path of the CSV file to create/overwrite.
    """
    with pymysql.connect(host="localhost", user="root", password="", db="sales") as conn:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            cursor.execute(query)
            # Field names for the CSV header come from the result metadata.
            headers = [col[0] for col in cursor.description]
            # newline="" is required so csv handles line endings itself.
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=headers)
                writer.writeheader()
                # Fetch in batches to avoid loading the whole result set.
                while True:
                    rows = cursor.fetchmany(1000)
                    if not rows:
                        break
                    writer.writerows(rows)
            # Fix: the original message had a garbled placeholder instead of
            # interpolating the output filename.
            print(f"成功导出 {filename},共 {cursor.rowcount} 条记录")


# 导出2023年销售数据
export_to_csv("SELECT * FROM orders WHERE year=2023", "2023_sales.csv")
从CSV导入到MySQL
利用LOAD DATA LOCAL INFILE命令实现高性能导入:
def import_from_csv(table, filename):
    """Bulk-load a CSV file into *table* using LOAD DATA LOCAL INFILE.

    The first line of the CSV is treated as a header and skipped.

    Args:
        table: Target table name. Must be a trusted identifier — it is
            interpolated directly into the SQL text (see NOTE below).
        filename: Path of the local CSV file to load.
    """
    with pymysql.connect(
        host="localhost",
        user="root",
        password="",
        db="sales",
        local_infile=True,  # must be enabled for LOAD DATA LOCAL INFILE
    ) as conn:
        with conn.cursor() as cursor:
            # NOTE(review): LOAD DATA does not support parameter placeholders,
            # so `filename` and `table` are interpolated into the SQL string.
            # Never pass untrusted values here (SQL injection risk).
            # Fix: the original had a garbled placeholder instead of {filename}.
            sql = f"""
                LOAD DATA LOCAL INFILE '{filename}'
                INTO TABLE {table}
                FIELDS TERMINATED BY ','
                ENCLOSED BY '"'
                LINES TERMINATED BY '\\n'
                IGNORE 1 ROWS
            """
            cursor.execute(sql)
            conn.commit()
            print(f"成功导入 {cursor.rowcount} 条记录到 {table}")


import_from_csv("customers", "new_customers.csv")
Excel格式:业务报表的优雅呈现
对于需要交付给业务部门的报表,Excel格式更为友好。通过pandas库可实现复杂表格样式设置。
数据库查询结果生成Excel
import pandas as pd
import pymysql


def query_to_excel(query, filename, sheet_name="数据报表"):
    """Run *query* and write the result to a formatted Excel workbook.

    The header row is rendered in bold and column widths are auto-sized
    to the longest cell value in each column.

    Args:
        query: SELECT statement to execute.
        filename: Path of the .xlsx file to create/overwrite.
        sheet_name: Worksheet title (defaults to "数据报表").
    """
    # Fix: Font was referenced without importing openpyxl in the original.
    from openpyxl.styles import Font

    conn = pymysql.connect(host="localhost", user="root", password="", db="sales")
    try:
        df = pd.read_sql(query, conn)
    finally:
        # Fix: close the connection even if read_sql raises.
        conn.close()

    with pd.ExcelWriter(filename, engine="openpyxl") as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)
        worksheet = writer.sheets[sheet_name]
        # Row 1 is the header row (worksheet rows are 1-indexed) — the
        # original comment incorrectly called it the second row.
        header_font = Font(bold=True)
        for cell in worksheet[1]:
            cell.font = header_font
        # Auto-fit column widths to the widest value in each column.
        for column_cells in worksheet.columns:
            max_length = max(len(str(cell.value)) for cell in column_cells)
            worksheet.column_dimensions[column_cells[0].column_letter].width = max_length + 2
    # Fix: the original message had a garbled placeholder instead of {filename}.
    print(f"Excel报表已生成: {filename}")


query_to_excel("SELECT * FROM monthly_sales WHERE year=2023", "2023年度销售报表.xlsx")
JSON格式:API数据的标准交换
JSON格式适合与前端交互或API数据传输,PyMySQL结合json模块可实现复杂嵌套结构的序列化。
导出嵌套JSON数据
import json
import pymysql


def export_nested_json(filename):
    """Export completed orders, with their items and customer, as nested JSON.

    Each order dict gains an ``items`` list and a ``customer`` dict; the
    redundant ``customer_id`` key is removed after the customer lookup.

    Args:
        filename: Path of the JSON file to create/overwrite.
    """
    with pymysql.connect(host="localhost", user="root", password="", db="ecommerce") as conn:
        with conn.cursor(pymysql.cursors.DictCursor) as cursor:
            cursor.execute(
                "SELECT id, customer_id, order_date FROM orders WHERE status='completed'"
            )
            orders = cursor.fetchall()
            # NOTE(review): this issues two extra queries per order (N+1
            # pattern) — acceptable for small exports, consider JOINs for
            # large ones.
            for order in orders:
                cursor.execute(
                    """
                    SELECT product_id, quantity, price
                    FROM order_items
                    WHERE order_id = %s
                    """,
                    (order["id"],),
                )
                order["items"] = cursor.fetchall()
                cursor.execute(
                    """
                    SELECT name, email
                    FROM customers
                    WHERE id = %s
                    """,
                    (order["customer_id"],),
                )
                order["customer"] = cursor.fetchone()
                del order["customer_id"]  # redundant once customer is embedded

    with open(filename, "w", encoding="utf-8") as f:
        # Fix: order_date comes back as a date/datetime object, which
        # json.dump cannot serialize — default=str renders it as ISO text.
        json.dump(orders, f, ensure_ascii=False, indent=2, default=str)
    # Fix: the original message had a garbled placeholder instead of {filename}.
    print(f"已导出 {len(orders)} 条订单数据到 {filename}")


export_nested_json("已完成订单详情.json")
导入JSON数据到数据库
def import_json_to_db(filename):
    """Load product records (with optional tags) from a JSON file into MySQL.

    Foreign-key checks are disabled during the load for speed and are
    always re-enabled afterwards, even if an insert fails.

    Args:
        filename: Path of a JSON file containing a list of product dicts
            with keys ``id``, ``name``, ``category``, ``price`` and an
            optional ``tags`` list.
    """
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)

    with pymysql.connect(host="localhost", user="root", password="", db="ecommerce") as conn:
        with conn.cursor() as cursor:
            # Disable FK checks to speed up the bulk insert.
            cursor.execute("SET FOREIGN_KEY_CHECKS=0")
            try:
                for item in data:
                    cursor.execute(
                        """
                        INSERT INTO products (id, name, category, price)
                        VALUES (%s, %s, %s, %s)
                        """,
                        (item["id"], item["name"], item["category"], item["price"]),
                    )
                    # Batch-insert tags, if present, in one round trip.
                    if "tags" in item:
                        tag_values = [(item["id"], tag) for tag in item["tags"]]
                        cursor.executemany(
                            """
                            INSERT INTO product_tags (product_id, tag)
                            VALUES (%s, %s)
                            """,
                            tag_values,
                        )
                conn.commit()
            finally:
                # Fix: the original only restored FK checks on the success
                # path; a failed insert would leave them disabled.
                cursor.execute("SET FOREIGN_KEY_CHECKS=1")
    print(f"成功导入 {len(data)} 条产品数据")


import_json_to_db("新产品数据.json")
性能优化与最佳实践
批量操作优化
- 使用 executemany() 代替循环调用 execute(),降低网络往返开销
- 设置 conn.autocommit(True) 减少事务提交开销
- 大文件处理采用分页查询:LIMIT offset, batch_size
内存管理策略
# Stream a large result set in batches to keep memory usage bounded.
def stream_large_query(query, batch_size=1000):
    """Execute *query* with a server-side cursor and hand rows to
    process_batch() in chunks of *batch_size*."""
    with pymysql.connect(host="localhost", user="root", password="", db="large_data") as conn:
        # SSCursor streams rows from the server instead of buffering the
        # entire result set client-side.
        with conn.cursor(pymysql.cursors.SSCursor) as cursor:
            cursor.execute(query)
            batch = cursor.fetchmany(batch_size)
            while batch:
                process_batch(batch)  # downstream batch handler
                batch = cursor.fetchmany(batch_size)


stream_large_query("SELECT * FROM user_behavior_log")
错误处理机制
def safe_data_transfer():
    """Run transfer_data() inside an explicit transaction, rolling back on
    database errors and reporting other failures."""
    conn = None
    try:
        with pymysql.connect(host="localhost", user="root", password="", db="sales") as conn:
            conn.begin()  # start an explicit transaction
            transfer_data(conn)  # the actual data operations
            conn.commit()
            print("数据传输成功")
    except pymysql.MySQLError as e:
        print(f"数据库错误: {e}")
        # The `with` block may already have closed the connection on exit;
        # only attempt a rollback while it is still open.
        if conn is not None and conn.open:
            conn.rollback()
    except Exception as e:
        print(f"系统错误: {e}")
完整工作流示例
以下是一个从CSV导入→数据清洗→数据库存储→JSON导出的完整自动化流程:
import csv
import json

import pandas as pd
import pymysql


def data_processing_pipeline(csv_path, json_output):
    """End-to-end flow: CSV import → cleaning → DB storage → JSON export.

    Reads raw user rows from *csv_path*, deduplicates and filters them,
    stores the cleaned set in MySQL, then writes age-group slices to
    *json_output* as JSON.

    Args:
        csv_path: Path of the raw users CSV (must contain ``birth_date``
            and ``age`` columns — assumed from usage below, confirm with
            the data producer).
        json_output: Path of the JSON file to create/overwrite.
    """
    # 1. Load and clean the raw CSV data.
    df = pd.read_csv(csv_path)
    df = df.drop_duplicates()
    df["birth_date"] = pd.to_datetime(df["birth_date"])  # normalize dates
    df = df[df["age"] > 18]  # keep adult users only

    # 2. Persist the cleaned rows to MySQL.
    # NOTE(review): DataFrame.to_sql only accepts a SQLAlchemy engine (or a
    # sqlite3 connection); passing a raw pymysql connection raises. Wrap the
    # DSN with sqlalchemy.create_engine("mysql+pymysql://...") for real use.
    conn = pymysql.connect(host="localhost", user="root", password="", db="user_db")
    try:
        df.to_sql("cleaned_users", conn, if_exists="replace", index=False)
    finally:
        # Close the connection even if the write fails.
        conn.close()

    # 3. Export age-group slices as JSON.
    age_groups = {
        "青年": df[df["age"].between(18, 30)],
        "中年": df[df["age"].between(31, 50)],
        "老年": df[df["age"] > 50],
    }
    with open(json_output, "w", encoding="utf-8") as f:
        json.dump(
            {name: part.to_dict("records") for name, part in age_groups.items()},
            f,
            ensure_ascii=False,
            indent=2,
            # Fix: birth_date values are pandas Timestamps, which json.dump
            # cannot serialize — default=str renders them as ISO text.
            default=str,
        )
    print(f"完整流程完成,生成文件: {json_output}")


data_processing_pipeline("raw_users.csv", "用户分群数据.json")
扩展资源与工具推荐
- 官方文档:docs/source/user/examples.rst提供基础CRUD操作示例
- 性能测试:pymysql/tests/test_load_local.py包含本地文件加载的性能测试代码
- 第三方库:
  - pymysqlpool:数据库连接池实现,适合高并发场景
  - sqlalchemy:ORM框架,提供更高级的数据抽象
- 监控工具:通过pymysql/connections.py中的debug日志可跟踪连接状态与查询性能
通过本文介绍的方法,你已经掌握了PyMySQL在三种主流数据格式间的转换技巧。这些工具不仅能提升日常工作效率,更能作为自动化数据管道的基础组件。根据实际业务需求选择合适的格式与方法,可大幅降低数据处理的复杂度,让Python与MySQL的协作更加顺畅。
创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考



