数据库连接池耗尽导致服务雪崩
场景背景
某电商平台促销期间,订单服务突然大面积超时,前端报错 500 Internal Server Error,同时监控显示:
- Nginx 499(客户端主动断开)错误率飙升到 35%
- 订单服务平均响应时间从 50ms 暴涨至 8s
- 数据库连接池活跃连接数达上限(1000/1000)
根本原因分析
核心原因链:
-
商品查询接口新增 WHERE tags LIKE ‘%新品%’ 条件,导致全表扫描(800万行表)
-
ORM框架未正确关闭ResultSet,连接未返还连接池
-
线程池排队队列过长(2000+),触发Tomcat线程饥饿
一. 复现步骤(测试环境)
1. 创建目录与配置文件
mkdir -p /data/mysql-failure/{mysql,app,sql-scripts} && cd /data/mysql-failure
cat > mysql/my.cnf <<EOF
[mysqld]
max_connections = 10 # 限制最大连接数
slow_query_log = 1 # 开启慢查询日志
long_query_time = 5 # 超时阈值(5秒)
innodb_buffer_pool_size = 128M # 限制内存(制造磁盘IO)
log_queries_not_using_indexes = 1 # 记录未使用索引的查询
EOF
2. 启动 MySQL 容器
docker run -d --name=mysql \
-v $(pwd)/mysql:/etc/mysql/conf.d \
-e MYSQL_ROOT_PASSWORD=123456 \
-p 3306:3306 mysql:8.0
# 校验容器状态
docker ps | grep mysql || echo "[ERROR] MySQL 启动失败"
3. 创建sql文件
cat > sql-scripts/init.sql <<'EOF'
CREATE DATABASE IF NOT EXISTS order_db;
use order_db;
-- 创建数据表
CREATE TABLE `user_orders` (
`id` BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
`user_id` INT NOT NULL,
`product_id` SMALLINT NOT NULL,
`order_status` TINYINT NOT NULL COMMENT '0:未支付 1:已支付 2:已取消',
`amount` DECIMAL(10,2) NOT NULL,
`create_time` DATETIME NOT NULL,
`update_time` DATETIME NOT NULL,
`remark` VARCHAR(200) DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- 创建数据存储过程,用于初始化大量数据
DELIMITER $$
CREATE PROCEDURE GenerateOrderData()
BEGIN
DECLARE i INT DEFAULT 0;
DECLARE batch_size INT DEFAULT 1000; -- 每批插入量
DECLARE max_records INT DEFAULT 50000000; -- 0.5亿条记录
WHILE i < max_records DO
INSERT INTO user_orders (
user_id,
product_id,
order_status,
amount,
create_time,
update_time,
remark
)
SELECT
FLOOR(RAND() * 1000000), -- 100万用户
FLOOR(RAND() * 500 + 1), -- 500种商品
FLOOR(RAND() * 3), -- 3种状态
ROUND(RAND() * 1000, 2), -- 0~1000随机金额
DATE_ADD('2020-01-01', INTERVAL FLOOR(RAND() * 1460) DAY), -- 4年时间范围
DATE_ADD('2020-01-01', INTERVAL FLOOR(RAND() * 1460) DAY),
CONCAT('Remark_', UUID())
FROM
(SELECT 1 UNION SELECT 2 UNION SELECT 3 UNION SELECT 4 UNION SELECT 5) a,
(SELECT 1 UNION SELECT 2 UNION SELECT 3 UNION SELECT 4 UNION SELECT 5) b,
(SELECT 1 UNION SELECT 2 UNION SELECT 3 UNION SELECT 4 UNION SELECT 5) c,
(SELECT 1 UNION SELECT 2 UNION SELECT 3 UNION SELECT 4 UNION SELECT 5) d
LIMIT batch_size;
SET i = i + batch_size;
END WHILE;
END
$$
DELIMITER ;
EOF
4.执行sql文件
docker cp sql-scripts/init.sql mysql:/init.sql
docker exec -it mysql mysql -uroot -p123456 -e "source /init.sql; CALL GenerateOrderData();"
# 校验数据量
docker exec -it mysql mysql -uroot -p123456 -e "USE order_db; SELECT COUNT(*) FROM user_orders;"
5.部署 Spring Boot 应用 – 用idea部署好打包成jar包,docker运行
5.1 com.example.demo.Application
package com.example.demo;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.web.bind.annotation.*;
import javax.sql.DataSource;
import java.sql.*;
@SpringBootApplication
@RestController
public class Application {
private final DataSource dataSource;
private String man_sql="SELECT \n" +
" id,\n" +
" user_id,\n" +
" (SELECT MAX(amount) \n" +
" FROM user_orders u2 \n" +
" WHERE u2.user_id = u1.user_id \n" +
" AND u2.create_time > DATE_SUB(u1.create_time, INTERVAL 90 DAY)) AS max_amount_in_90d,\n" +
" (SELECT GROUP_CONCAT(DISTINCT product_id) \n" +
" FROM user_orders u3 \n" +
" WHERE u3.user_id = u1.user_id \n" +
" AND u3.remark LIKE CONCAT('%', SUBSTRING(u1.remark, 5, 3), '%')) AS related_products\n" +
"FROM user_orders u1\n" +
"WHERE create_time BETWEEN '2020-01-01' AND '2021-12-31'\n" +
"AND EXISTS (\n" +
" SELECT 1 \n" +
" FROM user_orders u3 \n" +
" WHERE u3.user_id = u1.user_id \n" +
" AND u3.remark LIKE CONCAT('%', SUBSTRING(u1.remark, 5, 3), '%')\n" +
")\n" +
"ORDER BY LENGTH(SUBSTRING_INDEX(SUBSTRING_INDEX(remark, '_', 2), '_', -1)) DESC\n" +
"LIMIT 10;";
public Application(DataSource dataSource) {
this.dataSource = dataSource;
}
// 场景1:连接泄漏(未关闭)
@GetMapping("/leak")
public String leak() throws SQLException {
Connection conn = dataSource.getConnection(); // 故意不关闭
return "Leaked 1 connection!";
}
// 场景2:真实慢SQL
@GetMapping("/slow")
public String slow() throws SQLException {
try (Connection conn = dataSource.getConnection();
Statement stmt = conn.createStatement()) {
ResultSet rs = stmt.executeQuery(man_sql);
return rs.getArray(1).toString();
}
}
// 场景3:连接创建并关闭
@GetMapping("/leak_close")
public String leak_close() throws SQLException {
Connection conn = null;
try {
conn = dataSource.getConnection();
} catch (SQLException e) {
throw new RuntimeException(e);
} finally {
conn.close();
}
return "create one Connected and close";
}
public static void main(String[] args) {
SpringApplication.run(Application.class, args);
}
}
5.2 application.yml
spring:
datasource:
url: jdbc:mysql://192.168.1.46:3306/order_db?useSSL=false&serverTimezone=UTC&characterEncoding=utf8
username: root
password: 123456
driver-class-name: com.mysql.cj.jdbc.Driver
hikari:
maximum-pool-size: 10
minimum-idle: 2
idle-timeout: 60000
connection-timeout: 30000
max-lifetime: 600000
jackson:
time-zone: GMT+8
5.3 pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.example.demo</groupId>
<artifactId>leak_out</artifactId>
<version>1.0-SNAPSHOT</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.7.18</version>
</parent>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.33</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
5.4 自行打包好jar包传入服务器,编排docker
5.4.1 jar包放入服务器
mv leak_out-1.0-SNAPSHOT.jar /data/mysql-failure/app/leak_out.jar
cd /data/mysql-failure/app
5.4.2 创建Dockerfile文件
cat > Dockerfile <<EOF
FROM openjdk:8-jdk
WORKDIR /app
CMD ["java", "-jar", "leak_out.jar"]
EOF
5.4.3 创建docker-compose.yml文件
cat <<EOF > docker-compose.yml
version: '3.8'
services:
leak-out-app:
build: .
container_name: leak-out-container
ports:
- "5580:8080"
volumes:
- /data/mysql-failure/app/leak_out.jar:/app/leak_out.jar
restart: unless-stopped
EOF
5.4.4 启动容器
docker-compose up -d
5.5 测试
5.5.1 正常测试
curl -s http://192.168.1.46:5580/leak >/dev/null &
curl -s http://192.168.1.46:5580/slow >/dev/null &
curl -s http://192.168.1.46:5580/leak_close >/dev/null &
5.5.2 模拟正常调用
for i in {1..10}; do
curl -s http://192.168.1.46:5580/leak_close >/dev/null &
sleep 2
done
# 校验连接数 (应连接数正常,不会耗尽)
docker exec -it mysql mysql -uroot -p123456 -e "SHOW STATUS LIKE 'Threads_connected'" | grep -v 1
5.5.3 模拟连接泄漏(快速耗尽连接池)
for i in {1..10}; do
curl -s http://192.168.1.46:5580/leak >/dev/null &
sleep 2
done
# 校验连接数 (应连接数10左右)
docker exec -it mysql mysql -uroot -p123456 -e "SHOW STATUS LIKE 'Threads_connected'" | grep -v 1
5.5.4 触发慢查询阻塞(占用剩余连接)
for i in {1..10}; do
curl -s http://192.168.1.46:5580/slow >/dev/null &
sleep 2
done
# 查看慢日志 (应有全表扫描记录)
docker exec mysql grep "SELECT" /var/lib/mysql/$(hostname)-slow.log
注意:
1. 开启慢查询后,如果my.cnf配置里没指定,则可以在 /var/lib/mysql/$(hostname)-slow.log 可以查询慢sql日志
2. 查看慢查询日志配置,mysql终端内部使用
##mysql终端内部使用
# 查看慢查询配置
SHOW VARIABLES LIKE '%slow_query%';
SHOW VARIABLES LIKE 'long_query_time';
# 查看慢查询日志路径
SHOW VARIABLES LIKE 'slow_query_log_file';
# 重置慢查询日志(需要时使用)
SET GLOBAL slow_query_log = 'OFF';
SET GLOBAL slow_query_log = 'ON';
3. 慢查询日志示例:
# Time: 2025-07-07T10:12:10.927209Z
# User@Host: root[root] @ [112.94.5.80] Id: 9
# Query_time: 41.205736 Lock_time: 0.000008 Rows_sent: 40219 Rows_examined: 31250000
use order_db;
SET timestamp=1751883089;
SELECT * FROM user_orders
WHERE remark LIKE '%abc%';
二 排查
1. 定位故障
1.1 检查服务器性能
# 检查服务器cpu,内存
htop
1.2 检查数据库连接数
netstat -ant | grep 3306 | wc -l # 数据库连接数超过max_connections
注意:
- 检查max_connections
mysql> SHOW VARIABLES like ‘%max_connections%’;
1.3 应急解决办法
mysql> SET GLOBAL max_connections=100; -- 临时扩容连接池
mysql> SHOW VARIABLES like '%max_connections%'; -- 检查是否配置成功
2. 分析故障
2.1 抓取数据库当前线程
mysql> SHOW FULL PROCESSLIST;
注意:
- 根据id,删除sleep状态且time时间很长的process
kill id;
2.2 解析MySQL慢日志
mysqldumpslow -s t /var/lib/mysql/$(hostname)-slow.log
注意:
- 通过explain分析sql有没有走索引,记录sql,kill掉慢sql,然后优化sql,添加索引后,再发起sql
3. 优化
3.1 优化mysql配置
# my.cnf 配置
[mysqld]
wait_timeout = 180 # 非交互连接超时3分钟
interactive_timeout = 180 # 交互连接超时3分钟
max_connections = 200 # 限制最大连接数
3.2 客户端连接池优化
# Spring Boot 配置示例
spring:
datasource:
hikari:
max-lifetime: 1800000 # 最大存活时间(30分钟)
idle-timeout: 180000 # 3分钟空闲超时
leak-detection-threshold: 180000 # 3分钟连接泄露检测
3.3 代码层面优化
# 连接用完后关闭回收
public String slow() throws SQLException {
Connection conn = null;
Statement stmt = null;
ResultSet rs = null;
try {
conn = dataSource.getConnection();
stmt = conn.createStatement();
rs = stmt.executeQuery(man_sql);
return rs.getArray(1).toString();
} catch (Exception e) {
e.printStackTrace();
}finally {
rs.close();
stmt.close();
conn.close();
return null;
}
}