I. Background
Today an alert suddenly fired for a stopped replication thread on one of our replicas. Logging in to check, replication showed no error at all, yet the slave thread had been stopped. Experience immediately pointed at the backup, so let's walk through the cause in detail.
II. Detailed investigation
1. The backup command
A few key points to note here:
- --safe-slave-backup: to keep the backed-up data consistent, xtrabackup stops the replica's SQL thread once the InnoDB data files have been copied and only restarts it after the backup releases its locks; this is the main reason the replication thread is reported as stopped (note the "Starting slave SQL thread" line in the backup log below)
- pv -qL 20000k caps the streaming rate at roughly 20 MB/s (a minimal sketch of this kind of rate limiting follows the command)
innobackupex --slave-info --safe-slave-backup --safe-slave-backup-timeout=3600 --throttle=500 --stream=xbstream --kill-long-queries-timeout=120 | pv -qL 20000k | qpress -io
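For reference, both pv -L and xtrabackup's --throttle boil down to the same idea: pause the copier whenever it gets ahead of the allowed byte (or I/O) rate. The sketch below only illustrates that idea; it is not the actual implementation of either tool, and ByteRateLimiter and its methods are invented for this example.

#include <chrono>
#include <cstdint>
#include <thread>

// A byte-rate limiter in the spirit of `pv -L` / --throttle: after sending
// n bytes, sleep until the average rate is back under the cap.
class ByteRateLimiter {
 public:
  explicit ByteRateLimiter(uint64_t bytes_per_second)
      : rate_(bytes_per_second), start_(std::chrono::steady_clock::now()) {}

  // Account for `n` bytes just sent; sleep if we are ahead of schedule.
  void consume(uint64_t n) {
    sent_ += n;
    // The earliest point in time at which `sent_` bytes may have gone out.
    auto allowed_at = start_ + std::chrono::duration<double>(
                                   static_cast<double>(sent_) / rate_);
    std::this_thread::sleep_until(allowed_at);
  }

 private:
  uint64_t rate_;
  uint64_t sent_ = 0;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  // ~20 MB/s, the same order of magnitude as `pv -qL 20000k` above.
  ByteRateLimiter limiter(20000ull * 1024);
  char chunk[64 * 1024] = {};
  for (int i = 0; i < 10; ++i) {
    // ... write(chunk) to the backup stream here ...
    limiter.consume(sizeof(chunk));
  }
}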
2. The backup log
From the log below: after the log-copying thread is told to stop at 13:59:53, it takes about 16 minutes (until 14:15:41) before the tables are unlocked and the slave SQL thread is started again. Over that window the LSN moves from 17409579952330 to 17410356759779, so a fair amount of change accumulated; the long run of dots is the time spent copying the redo log.
240814 13:59:53 Executing FLUSH NO_WRITE_TO_BINLOG ENGINE LOGS...
xtrabackup: The latest check point (for incremental): '17409579952330'
xtrabackup: Stopping log copying thread.
.......................................................................................................................................................................................................................................................................................................................240814 14:15:41 >> log scanned up to (17410356759779)
240814 14:15:41 >> log scanned up to (17410356759779)
240814 14:15:41 Executing UNLOCK TABLES
240814 14:15:41 All tables unlocked
Starting slave SQL thread
240814 14:15:41 [00] Streaming ib_buffer_pool to <STDOUT>
240814 14:15:41 [00] ...done
3. The redo-copy source code
The instance that hit the problem runs version 2.4.8, so we analyze the 2.4.8 code:
xtrabackup-2.4.8/storage/innobase/xtrabackup/src/xtrabackup.cc
/* The main thread asks the log-copying thread to stop: it clears log_copying,
   signals the log_copying_stop event, and then prints a dot every 0.2 s until
   log_copying_running goes back to FALSE, i.e. until the copy thread has
   finished its last pass. */
skip_last_cp:
    /* stop log_copying_thread */
    log_copying = FALSE;
    os_event_set(log_copying_stop);
    msg("xtrabackup: Stopping log copying thread.\n");

    while (log_copying_running) {
        msg(".");
        os_thread_sleep(200000); /*0.2 sec*/
    }
    msg("\n");
/* ------------------------------------------------------------------ */
static
#ifndef __WIN__
void*
#else
ulint
#endif
log_copying_thread(
    void* arg __attribute__((unused)))
{
    /*
      Initialize mysys thread-specific memory so we can
      use mysys functions in this thread.
    */
    my_thread_init();

    ut_a(dst_log_file != NULL);

    log_copying_running = TRUE;

    while (log_copying) {
        os_event_reset(log_copying_stop);
        os_event_wait_time_low(log_copying_stop,
                               xtrabackup_log_copy_interval * 1000ULL,
                               0);
        if (log_copying) {
            if (xtrabackup_copy_logfile(log_copy_scanned_lsn,
                                        FALSE)) {
                exit(EXIT_FAILURE);
            }
        }
    }

    /* last copying */
    if (xtrabackup_copy_logfile(log_copy_scanned_lsn, TRUE)) {
        exit(EXIT_FAILURE);
    }

    log_copying_running = FALSE;
    my_thread_end();
    os_thread_exit();

    return(0);
}
/* ------------------------------------------------------------------ */
static my_bool
xtrabackup_copy_logfile(lsn_t from_lsn, my_bool is_last)
{
    /* definition from recv_recovery_from_checkpoint_start() */
    log_group_t*  group;
    lsn_t         group_scanned_lsn;
    lsn_t         contiguous_lsn;

    ut_a(dst_log_file != NULL);

    /* read from checkpoint_lsn_start to current */
    contiguous_lsn = ut_uint64_align_down(from_lsn, OS_FILE_LOG_BLOCK_SIZE);

    /* TODO: We must check the contiguous_lsn still exists in log file.. */
    group = UT_LIST_GET_FIRST(log_sys->log_groups);

    while (group) {
        bool   finished;
        lsn_t  start_lsn;
        lsn_t  end_lsn;

        /* reference recv_group_scan_log_recs() */
        finished = false;

        start_lsn = contiguous_lsn;

        while (!finished) {
            end_lsn = start_lsn + RECV_SCAN_SIZE;

            xtrabackup_io_throttling();

            mutex_enter(&log_sys->mutex);

            log_group_read_log_seg(log_sys->buf,
                                   group, start_lsn, end_lsn);

            if (!xtrabackup_scan_log_recs(group, is_last,
                    start_lsn, &contiguous_lsn, &group_scanned_lsn,
                    from_lsn, &finished)) {
                goto error;
            }

            mutex_exit(&log_sys->mutex);

            start_lsn = end_lsn;
        }

        group->scanned_lsn = group_scanned_lsn;

        msg_ts(">> log scanned up to (" LSN_PF ")\n",
               group->scanned_lsn);

        group = UT_LIST_GET_NEXT(log_groups, group);

        /* update global variable*/
        log_copy_scanned_lsn = group_scanned_lsn;

        /* innodb_mirrored_log_groups must be 1, no other groups */
        ut_a(group == NULL);

        debug_sync_point("xtrabackup_copy_logfile_pause");
    }

    return(FALSE);

error:
    mutex_exit(&log_sys->mutex);
    ds_close(dst_log_file);
    msg("xtrabackup: Error: xtrabackup_copy_logfile() failed.\n");

    return(TRUE);
}
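Reading the two pieces together: after the data files are copied, the main thread clears log_copying, signals log_copying_stop, and prints a dot every 0.2 s until log_copying_running drops back to FALSE. The copy thread, however, only clears log_copying_running after the final xtrabackup_copy_logfile(log_copy_scanned_lsn, TRUE) pass, which has to scan and stream every redo byte written since the previous incremental pass; that pass is exactly the 16-minute run of dots in the backup log. Below is a minimal sketch of the same handshake, using std::thread and std::condition_variable instead of os_event_*; incremental_copy() and final_copy() are invented placeholders for xtrabackup_copy_logfile(..., FALSE/TRUE).

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

std::atomic<bool> log_copying{true};
std::atomic<bool> log_copying_running{false};
std::mutex m;
std::condition_variable stop_cv;

void incremental_copy() { /* copy whatever redo arrived since the last pass */ }
void final_copy() { /* copy everything up to the current end of the redo log;
                       on a busy server this pass can run for minutes */ }

void log_copying_thread() {
  log_copying_running = true;
  while (log_copying) {
    std::unique_lock<std::mutex> lk(m);
    // Wake up once per copy interval, or immediately when stop is requested.
    stop_cv.wait_for(lk, std::chrono::seconds(1),
                     [] { return !log_copying.load(); });
    lk.unlock();
    if (log_copying) incremental_copy();
  }
  final_copy();                 // the "last copying" pass still runs after stop
  log_copying_running = false;  // only now does the main thread stop printing dots
}

int main() {
  std::thread t(log_copying_thread);
  // ... the data file copy would happen here ...
  log_copying = false;   // ask the copy thread to stop
  stop_cv.notify_all();  // analogous to os_event_set(log_copying_stop)
  while (log_copying_running) {
    std::printf(".");    // these are the dots seen in the backup log
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
  }
  std::printf("\n");
  t.join();
}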
For comparison, here is the corresponding logic on the master branch (the inline comments are annotations):
void Redo_Log_Data_Manager::copy_func() {
  // Initialize per-thread resources.
  my_thread_init();
  // Create a THD so the thread can be identified in the error log.
  THD *thd = create_thd(false, false, true, 0, 0);
  // Mark the copy as not aborted.
  aborted = false;

  // If a redo log consumer is registered and may advance, push its LSN forward.
  if (xtrabackup_register_redo_log_consumer && redo_log_consumer_can_advance.load()) {
    redo_log_consumer.advance(redo_log_consumer_cnx, reader.get_scanned_lsn());
  }

  bool finished;           // whether one copy pass reached the end of the log
  lsn_t consumer_lsn = 0;  // the consumer's current LSN

  // Keep copying until aborted or until the stop LSN has been reached.
  while (!aborted && (stop_lsn == 0 || stop_lsn > reader.get_scanned_lsn())) {
    // Apply I/O throttling.
    xtrabackup_io_throttling();

    // Attempt one copy pass.
    if (!copy_once(false, &finished)) {
      // On failure, set the error flag and return.
      error = true;
      return;
    }

    // If a redo log consumer is registered and may advance...
    if (xtrabackup_register_redo_log_consumer && redo_log_consumer_can_advance.load()) {
      // If the archived-log monitor is ready and the archived log is positioned,
      // the consumer is no longer needed: deregister it and close the connection.
      if (archived_log_monitor.is_ready() && archived_log_state == ARCHIVED_LOG_POSITIONED) {
        redo_log_consumer.deinit(redo_log_consumer_cnx);
        mysql_close(redo_log_consumer_cnx);
        xtrabackup_register_redo_log_consumer = false;
      } else {
        // Otherwise, advance the consumer whenever the scanned LSN has moved.
        if (consumer_lsn != reader.get_scanned_lsn()) {
          consumer_lsn = reader.get_scanned_lsn();
          redo_log_consumer.advance(redo_log_consumer_cnx, consumer_lsn);
        }
      }
    }

    // If this pass caught up with the end of the log...
    if (finished) {
      // ...log the progress...
      xb::info() << ">> log scanned up to (" << reader.get_scanned_lsn() << ")";
      // ...hit the debug sync point...
      debug_sync_point("xtrabackup_copy_logfile_pause");
      // ...and wait for up to copy_interval milliseconds before the next pass.
      os_event_reset(event);
      os_event_wait_time_low(event, std::chrono::milliseconds{copy_interval}, 0);
    }
  }

  // One final pass so that everything up to the stop LSN is copied.
  if (!aborted && !copy_once(true, &finished)) {
    error = true;
  }

  // Destroy the THD and release per-thread resources.
  destroy_thd(thd);
  my_thread_end();
}
// Returns the LSN of the last checkpoint.
lsn_t Redo_Log_Data_Manager::get_last_checkpoint_lsn() const {
  return last_checkpoint_lsn;
}

// Returns the LSN at which copying should stop.
lsn_t Redo_Log_Data_Manager::get_stop_lsn() const {
  return stop_lsn;
}

// Returns the LSN of the checkpoint the backup started from.
lsn_t Redo_Log_Data_Manager::get_start_checkpoint_lsn() const {
  return start_checkpoint_lsn;
}

// Returns the LSN scanned so far.
lsn_t Redo_Log_Data_Manager::get_scanned_lsn() const {
  return scanned_lsn;
}

// Sets the copy interval (milliseconds between copy passes).
void Redo_Log_Data_Manager::set_copy_interval(ulint interval) {
  copy_interval = interval;
}
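A note on the extra branch in copy_func(): when redo-log-consumer registration is enabled (xtrabackup_register_redo_log_consumer in the code above), the backup periodically calls redo_log_consumer.advance() with the LSN it has scanned, so the server does not recycle redo log space that the backup has not copied yet. The toy model below only illustrates that contract; the class, names and structure are invented and are not the server's or xtrabackup's code.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <string>

using lsn_t = uint64_t;

// Toy model: each registered consumer (e.g. a backup) periodically reports the
// LSN it has copied up to; the server must not reuse redo space below the
// smallest such LSN.
class RedoLogConsumers {
 public:
  void register_consumer(const std::string &name, lsn_t lsn) { consumers_[name] = lsn; }
  void deregister(const std::string &name) { consumers_.erase(name); }

  // Consumer reports: "I have copied all redo below `lsn`."
  void advance(const std::string &name, lsn_t lsn) { consumers_[name] = lsn; }

  // The server may only recycle redo log space strictly below this LSN.
  lsn_t reclaimable_up_to() const {
    lsn_t floor = std::numeric_limits<lsn_t>::max();  // no consumers: no limit
    for (const auto &c : consumers_) floor = std::min(floor, c.second);
    return floor;
  }

 private:
  std::map<std::string, lsn_t> consumers_;
};

// Usage, mirroring copy_func(): register at the start checkpoint LSN, then
// call advance() with the scanned LSN after each successful pass.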
4. Time estimation
The estimate below further supports that the stall is the time spent copying the redo log: scaling the LSN rate observed over a 14-second window to the redo that accumulated during the stall gives roughly 792 s (about 13 minutes), the same ballpark as the ~16 minutes seen in the backup log.
240812 21:26:11 >> log scanned up to (17281318758861)
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
240812 21:26:25 >> log scanned up to (17281332494143)
# 17281332494143 - 17281318758861 = 13735282 (14s)
# 17410356759779 - 17409579952330 = 776807449
# 776807449/13735282 * 14 = 792s
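The same back-of-the-envelope calculation as a small program, under the author's assumption that the final pass moves through redo at roughly the rate observed in the 14-second sample; the constants are the LSN values from the logs above.

#include <cstdint>
#include <cstdio>

int main() {
  // 14-second sample from the 240812 log: LSN (redo bytes) per elapsed second.
  const uint64_t sample_lsn_delta = 17281332494143ull - 17281318758861ull;  // 13,735,282
  const double sample_seconds = 14.0;
  const double redo_bytes_per_second = sample_lsn_delta / sample_seconds;

  // Redo accumulated between "Stopping log copying thread" and the final scan.
  const uint64_t stall_lsn_delta = 17410356759779ull - 17409579952330ull;   // 776,807,449

  const double estimated_stall = stall_lsn_delta / redo_bytes_per_second;
  std::printf("estimated final redo copy: %.0f s (~%.1f min)\n",
              estimated_stall, estimated_stall / 60.0);  // ~792 s, ~13.2 min
  return 0;
}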
III. Conclusion
When using --safe-slave-backup, keep an eye on the size of the non-InnoDB (MyISAM) tables and on how much redo log accumulates after the InnoDB data files have been copied, i.e. how write-heavy the instance is. Both can stretch the window during which the replica's SQL thread stays stopped long enough to fire replication alerts.