I. Background
Today an alert suddenly fired for a stopped replication thread on one of our replicas. Logging in to check, replication showed no error at all, yet the slave thread had been stopped. Experience immediately pointed at the backup, so let's walk through the cause in detail.
II. Detailed investigation
1. The backup command
A few key points to note here:
- --safe-slave-backup: to keep the backed-up data consistent, xtrabackup stops the replica's SQL thread once the InnoDB data files have been copied and only restarts it after the backup releases its locks; this is the main reason the replication thread is reported as stopped (note the "Starting slave SQL thread" line in the backup log below)
- pv -qL 20000k caps the streaming rate at roughly 20 MB/s (a minimal sketch of this kind of rate limiting follows the command)
innobackupex --slave-info --safe-slave-backup --safe-slave-backup-timeout=3600 --throttle=500 --stream=xbstream --kill-long-queries-timeout=120 | pv -qL 20000k | qpress -io
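For reference, both pv -L and xtrabackup's --throttle boil down to the same idea: pause the copier whenever it gets ahead of the allowed byte (or I/O) rate. The sketch below only illustrates that idea; it is not the actual implementation of either tool, and ByteRateLimiter and its methods are invented for this example.

#include <chrono>
#include <cstdint>
#include <thread>

// A byte-rate limiter in the spirit of `pv -L` / --throttle: after sending
// n bytes, sleep until the average rate is back under the cap.
class ByteRateLimiter {
 public:
  explicit ByteRateLimiter(uint64_t bytes_per_second)
      : rate_(bytes_per_second), start_(std::chrono::steady_clock::now()) {}

  // Account for `n` bytes just sent; sleep if we are ahead of schedule.
  void consume(uint64_t n) {
    sent_ += n;
    // The earliest point in time at which `sent_` bytes may have gone out.
    auto allowed_at = start_ + std::chrono::duration<double>(
                                   static_cast<double>(sent_) / rate_);
    std::this_thread::sleep_until(allowed_at);
  }

 private:
  uint64_t rate_;
  uint64_t sent_ = 0;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  // ~20 MB/s, the same order of magnitude as `pv -qL 20000k` above.
  ByteRateLimiter limiter(20000ull * 1024);
  char chunk[64 * 1024] = {};
  for (int i = 0; i < 10; ++i) {
    // ... write(chunk) to the backup stream here ...
    limiter.consume(sizeof(chunk));
  }
}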
2. The backup log
From the log below: after the log-copying thread is told to stop at 13:59:53, it takes about 16 minutes (until 14:15:41) before the tables are unlocked and the slave SQL thread is started again. Over that window the LSN moves from 17409579952330 to 17410356759779, so a fair amount of change accumulated; the long run of dots is the time spent copying the redo log.
240814 13:59:53 Executing FLUSH NO_WRITE_TO_BINLOG ENGINE LOGS...
xtrabackup: The latest check point (for incremental): '17409579952330'
xtrabackup: Stopping log copying thread.
.......................................................................................................................................................................................................................................................................................................................240814 14:15:41 >> log scanned up to (17410356759779)
240814 14:15:41 >> log scanned up to (17410356759779)
240814 14:15:41 Executing UNLOCK TABLES
240814 14:15:41 All tables unlocked
Starting slave SQL thread
240814 14:15:41 [00] Streaming ib_buffer_pool to <STDOUT>
240814 14:15:41 [00] ...done
3. The redo-copy source code
The instance that hit the problem runs version 2.4.8, so we analyze the 2.4.8 code:
xtrabackup-2.4.8/storage/innobase/xtrabackup/src/xtrabackup.cc
/* The main thread asks the log-copying thread to stop: it clears log_copying,
   signals the log_copying_stop event, and then prints a dot every 0.2 s until
   log_copying_running goes back to FALSE, i.e. until the copy thread has
   finished its last pass. */
skip_last_cp:
    /* stop log_copying_thread */
    log_copying = FALSE;
    os_event_set(log_copying_stop);
    msg("xtrabackup: Stopping log copying thread.\n");

    while (log_copying_running) {
        msg(".");
        os_thread_sleep(200000); /*0.2 sec*/
    }
    msg("\n");
/* ------------------------------------------------------------------ */
static
#ifndef __WIN__
void*
#else
ulint
#endif
log_copying_thread(
    void* arg __attribute__((unused)))
{
    /*
      Initialize mysys thread-specific memory so we can
      use mysys functions in this thread.
    */
    my_thread_init();

    ut_a(dst_log_file != NULL);

    log_copying_running = TRUE;

    while (log_copying) {
        os_event_reset(log_copying_stop);
        os_event_wait_time_low(log_copying_stop,
                               xtrabackup_log_copy_interval * 1000ULL,
                               0);
        if (log_copying) {
            if (xtrabackup_copy_logfile(log_copy_scanned_lsn,
                                        FALSE)) {
                exit(EXIT_FAILURE);
            }
        }
    }

    /* last copying */
    if (xtrabackup_copy_logfile(log_copy_scanned_lsn, TRUE)) {
        exit(EXIT_FAILURE);
    }

    log_copying_running = FALSE;
    my_thread_end();
    os_thread_exit();

    return(0);
}
/* ------------------------------------------------------------------ */
static my_bool
xtrabackup_copy_logfile(lsn_t from_lsn, my_bool is_last)
{
    /* definition from recv_recovery_from_checkpoint_start() */
    log_group_t*  group;
    lsn_t         group_scanned_lsn;
    lsn_t         contiguous_lsn;

    ut_a(dst_log_file != NULL);

    /* read from checkpoint_lsn_start to current */
    contiguous_lsn = ut_uint64_align_down(from_lsn, OS_FILE_LOG_BLOCK_SIZE);

    /* TODO: We must check the contiguous_lsn still exists in log file.. */
    group = UT_LIST_GET_FIRST(log_sys->log_groups);

    while (group) {
        bool   finished;
        lsn_t  start_lsn;
        lsn_t  end_lsn;

        /* reference recv_group_scan_log_recs() */
        finished = false;

        start_lsn = contiguous_lsn;

        while (!finished) {
            end_lsn = start_lsn + RECV_SCAN_SIZE;

            xtrabackup_io_throttling();

            mutex_enter(&log_sys->mutex);

            log_group_read_log_seg(log_sys->buf,
                                   group, start_lsn, end_lsn);

            if (!xtrabackup_scan_log_recs(group, is_last,
                    start_lsn, &contiguous_lsn, &group_scanned_lsn,
                    from_lsn, &finished)) {
                goto error;
            }

            mutex_exit(&log_sys->mutex);

            start_lsn = end_lsn;
        }

        group->scanned_lsn = group_scanned_lsn;

        msg_ts(">> log scanned up to (" LSN_PF ")\n",
               group->scanned_lsn);

        group = UT_LIST_GET_NEXT(log_groups, group);

        /* update global variable*/
        log_copy_scanned_lsn = group_scanned_lsn;

        /* innodb_mirrored_log_groups must be 1, no other groups */
        ut_a(group == NULL);

        debug_sync_point("xtrabackup_copy_logfile_pause");
    }

    return(FALSE);

error:
    mutex_exit(&log_sys->mutex);
    ds_close(dst_log_file);
    msg("xtrabackup: Error: xtrabackup_copy_logfile() failed.\n");

    return(TRUE);
}
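Reading the two pieces together: after the data files are copied, the main thread clears log_copying, signals log_copying_stop, and prints a dot every 0.2 s until log_copying_running drops back to FALSE. The copy thread, however, only clears log_copying_running after the final xtrabackup_copy_logfile(log_copy_scanned_lsn, TRUE) pass, which has to scan and stream every redo byte written since the previous incremental pass; that pass is exactly the 16-minute run of dots in the backup log. Below is a minimal sketch of the same handshake, using std::thread and std::condition_variable instead of os_event_*; incremental_copy() and final_copy() are invented placeholders for xtrabackup_copy_logfile(..., FALSE/TRUE).

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>

std::atomic<bool> log_copying{true};
std::atomic<bool> log_copying_running{false};
std::mutex m;
std::condition_variable stop_cv;

void incremental_copy() { /* copy whatever redo arrived since the last pass */ }
void final_copy() { /* copy everything up to the current end of the redo log;
                       on a busy server this pass can run for minutes */ }

void log_copying_thread() {
  log_copying_running = true;
  while (log_copying) {
    std::unique_lock<std::mutex> lk(m);
    // Wake up once per copy interval, or immediately when stop is requested.
    stop_cv.wait_for(lk, std::chrono::seconds(1),
                     [] { return !log_copying.load(); });
    lk.unlock();
    if (log_copying) incremental_copy();
  }
  final_copy();                 // the "last copying" pass still runs after stop
  log_copying_running = false;  // only now does the main thread stop printing dots
}

int main() {
  std::thread t(log_copying_thread);
  // ... the data file copy would happen here ...
  log_copying = false;   // ask the copy thread to stop
  stop_cv.notify_all();  // analogous to os_event_set(log_copying_stop)
  while (log_copying_running) {
    std::printf(".");    // these are the dots seen in the backup log
    std::this_thread::sleep_for(std::chrono::milliseconds(200));
  }
  std::printf("\n");
  t.join();
}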
For comparison, here is the corresponding logic on the master branch (the inline comments are annotations):
void Redo_Log_Data_Manager::copy_func() {
  // Initialize per-thread resources.
  my_thread_init();
  // Create a THD so the thread can be identified in the error log.
  THD *thd = create_thd(false, false, true, 0, 0);
  // Mark the copy as not aborted.
  aborted = false;

  // If a redo log consumer is registered and may advance, push its LSN forward.
  if (xtrabackup_register_redo_log_consumer && redo_log_consumer_can_advance.load()) {
    redo_log_consumer.advance(redo_log_consumer_cnx, reader.get_scanned_lsn());
  }

  bool finished;           // whether one copy pass reached the end of the log
  lsn_t consumer_lsn = 0;  // the consumer's current LSN

  // Keep copying until aborted or until the stop LSN has been reached.
  while (!aborted && (stop_lsn == 0 || stop_lsn > reader.get_scanned_lsn())) {
    // Apply I/O throttling.
    xtrabackup_io_throttling();

    // Attempt one copy pass.
    if (!copy_once(false, &finished)) {
      // On failure, set the error flag and return.
      error = true;
      return;
    }

    // If a redo log consumer is registered and may advance...
    if (xtrabackup_register_redo_log_consumer && redo_log_consumer_can_advance.load()) {
      // If the archived-log monitor is ready and the archived log is positioned,
      // the consumer is no longer needed: deregister it and close the connection.
      if (archived_log_monitor.is_ready() && archived_log_state == ARCHIVED_LOG_POSITIONED) {
        redo_log_consumer.deinit(redo_log_consumer_cnx);
        mysql_close(redo_log_consumer_cnx);
        xtrabackup_register_redo_log_consumer = false;
      } else {
        // Otherwise, advance the consumer whenever the scanned LSN has moved.
        if (consumer_lsn != reader.get_scanned_lsn()) {
          consumer_lsn = reader.get_scanned_lsn();
          redo_log_consumer.advance(redo_log_consumer_cnx, consumer_lsn);
        }
      }
    }

    // If this pass caught up with the end of the log...
    if (finished) {
      // ...log the progress...
      xb::info() << ">> log scanned up to (" << reader.get_scanned_lsn() << ")";
      // ...hit the debug sync point...
      debug_sync_point("xtrabackup_copy_logfile_pause");
      // ...and wait for up to copy_interval milliseconds before the next pass.
      os_event_reset(event);
      os_event_wait_time_low(event, std::chrono::milliseconds{copy_interval}, 0);
    }
  }

  // One final pass so that everything up to the stop LSN is copied.
  if (!aborted && !copy_once(true, &finished)) {
    error = true;
  }

  // Destroy the THD and release per-thread resources.
  destroy_thd(thd);
  my_thread_end();
}
// Returns the LSN of the last checkpoint.
lsn_t Redo_Log_Data_Manager::get_last_checkpoint_lsn() const {
  return last_checkpoint_lsn;
}

// Returns the LSN at which copying should stop.
lsn_t Redo_Log_Data_Manager::get_stop_lsn() const {
  return stop_lsn;
}

// Returns the LSN of the checkpoint the backup started from.
lsn_t Redo_Log_Data_Manager::get_start_checkpoint_lsn() const {
  return start_checkpoint_lsn;
}

// Returns the LSN scanned so far.
lsn_t Redo_Log_Data_Manager::get_scanned_lsn() const {
  return scanned_lsn;
}

// Sets the copy interval (milliseconds between copy passes).
void Redo_Log_Data_Manager::set_copy_interval(ulint interval) {
  copy_interval = interval;
}
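A note on the extra branch in copy_func(): when redo-log-consumer registration is enabled (xtrabackup_register_redo_log_consumer in the code above), the backup periodically calls redo_log_consumer.advance() with the LSN it has scanned, so the server does not recycle redo log space that the backup has not copied yet. The toy model below only illustrates that contract; the class, names and structure are invented and are not the server's or xtrabackup's code.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <string>

using lsn_t = uint64_t;

// Toy model: each registered consumer (e.g. a backup) periodically reports the
// LSN it has copied up to; the server must not reuse redo space below the
// smallest such LSN.
class RedoLogConsumers {
 public:
  void register_consumer(const std::string &name, lsn_t lsn) { consumers_[name] = lsn; }
  void deregister(const std::string &name) { consumers_.erase(name); }

  // Consumer reports: "I have copied all redo below `lsn`."
  void advance(const std::string &name, lsn_t lsn) { consumers_[name] = lsn; }

  // The server may only recycle redo log space strictly below this LSN.
  lsn_t reclaimable_up_to() const {
    lsn_t floor = std::numeric_limits<lsn_t>::max();  // no consumers: no limit
    for (const auto &c : consumers_) floor = std::min(floor, c.second);
    return floor;
  }

 private:
  std::map<std::string, lsn_t> consumers_;
};

// Usage, mirroring copy_func(): register at the start checkpoint LSN, then
// call advance() with the scanned LSN after each successful pass.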
4. Time estimation
The estimate below further supports that the stall is the time spent copying the redo log: scaling the LSN rate observed over a 14-second window to the redo that accumulated during the stall gives roughly 792 s (about 13 minutes), the same ballpark as the ~16 minutes seen in the backup log.
240812 21:26:11 >> log scanned up to (17281318758861)
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
xtrabackup: xtrabackup_backup_func active copy theads count: 1 .
240812 21:26:25 >> log scanned up to (17281332494143)
# 17281332494143 - 17281318758861 = 13735282 (14s)
# 17410356759779 - 17409579952330 = 776807449
# 776807449/13735282 * 14 = 792s
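The same back-of-the-envelope calculation as a small program, under the author's assumption that the final pass moves through redo at roughly the rate observed in the 14-second sample; the constants are the LSN values from the logs above.

#include <cstdint>
#include <cstdio>

int main() {
  // 14-second sample from the 240812 log: LSN (redo bytes) per elapsed second.
  const uint64_t sample_lsn_delta = 17281332494143ull - 17281318758861ull;  // 13,735,282
  const double sample_seconds = 14.0;
  const double redo_bytes_per_second = sample_lsn_delta / sample_seconds;

  // Redo accumulated between "Stopping log copying thread" and the final scan.
  const uint64_t stall_lsn_delta = 17410356759779ull - 17409579952330ull;   // 776,807,449

  const double estimated_stall = stall_lsn_delta / redo_bytes_per_second;
  std::printf("estimated final redo copy: %.0f s (~%.1f min)\n",
              estimated_stall, estimated_stall / 60.0);  // ~792 s, ~13.2 min
  return 0;
}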
III. Conclusion
When using --safe-slave-backup, keep an eye on the size of the non-InnoDB (MyISAM) tables and on how much redo log accumulates after the InnoDB data files have been copied, i.e. how write-heavy the instance is. Both can stretch the window during which the replica's SQL thread stays stopped long enough to fire replication alerts.