6 InnoDB相关的数据结构

最新推荐文章于 2024-09-18 07:00:00 发布

tz_sz

最新推荐文章于 2024-09-18 07:00:00 发布

阅读量1.3k

点赞数

CC 4.0 BY-SA版权

分类专栏： mysql

本文链接：https://blog.youkuaiyun.com/taozhi20084525/article/details/17577153

mysql 专栏收录该内容

18 篇文章

订阅专栏

本文深入探讨了InnoDB存储引擎中的几个核心数据结构，包括srv_sys_t、fil_system_t、buf_pool_t和trx_sys_t。通过对这些结构的详细解析，揭示了InnoDB如何管理和操作数据库的线程、文件、缓冲池及事务。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

以下讨论的结构体或函数都是出现在innobase_start_or_create_for_mysql()中。

一、srv_sys_t* srv_sys——server system

typedef struct srv_sys_struct	srv_sys_t
/** The server system struct */
struct srv_sys_struct{
	srv_table_t*	threads;	/*!< server thread table */
	UT_LIST_BASE_NODE_T(que_thr_t)
			tasks;		/*!< task queue */  //这是一个链表，其中的节点que_thr_t称为Query graph query thread node。
};

在srv_init()中，srv_sys->threads = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t))。

typedef struct srv_slot_struct srv_slot_t;

/* Thread slot in the thread table */
struct srv_slot_struct{
	unsigned	type:1;		/*!< thread type: user, utility etc. */
	unsigned	in_use:1;	/*!< TRUE if this slot is in use */
	unsigned	suspended:1;	/*!< TRUE if the thread is waiting
					for the event of this slot */
	ib_time_t	suspend_time;	/*!< time when the thread was
					suspended */
	os_event_t	event;		/*!< event used in suspending the
					thread when it has nothing to do */
	que_thr_t*	thr;		/*!< suspended query thread (only
					used for MySQL threads) */
};

备注：该宏定义经常被用到：用于表示一个链表

#define UT_LIST_BASE_NODE_T(TYPE)\
struct {\
	ulint	count;	/*!< count of nodes in list */\
	TYPE *	start;	/*!< pointer to list start, NULL if empty */\
	TYPE *	end;	/*!< pointer to list end, NULL if empty */\
}\

二、fil_system_t* fil_system

该结构体的初始化为fil_init()；fil_open_log_and_system_tablespace_files()中打开共享表文件和redo logs。

typedef	struct fil_system_struct	fil_system_t;
/** The tablespace memory cache; also the totality of logs (the log
data space) is stored here; below we talk about tablespaces, but also
the ib_logfiles form a 'space' and it is handled here */

struct fil_system_struct {
#ifndef UNIV_HOTBACKUP
	mutex_t		mutex;		/*!< The mutex protecting the cache */
#endif /* !UNIV_HOTBACKUP */
	hash_table_t*	spaces;		/*!< The hash table of spaces in the
					system; they are hashed on the space
					id */
	hash_table_t*	name_hash;	/*!< hash table based on the space
					name */
	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
					/*!< base node for the LRU list of the
					most recently used open files with no
					pending i/o's; if we start an i/o on
					the file, we first remove it from this
					list, and return it to the start of
					the list when the i/o ends;
					log files and the system tablespace are
					not put to this list: they are opened
					after the startup, and kept open until
					shutdown */
	UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
					/*!< base node for the list of those
					tablespaces whose files contain
					unflushed writes; those spaces have
					at least one file node where
					modification_counter > flush_counter */
	ulint		n_open;		/*!< number of files currently open */
	ulint		max_n_open;	/*!< n_open is not allowed to exceed
					this */
	ib_int64_t	modification_counter;/*!< when we write to a file we
					increment this by one */
	ulint		max_assigned_id;/*!< maximum space id in the existing
					tables, or assigned during the time
					mysqld has been up; at an InnoDB
					startup we scan the data dictionary
					and set here the maximum of the
					space id's of the tables there */
	ib_int64_t	tablespace_version;
					/*!< a counter which is incremented for
					every space object memory creation;
					every space mem object gets a
					'timestamp' from this; in DISCARD/
					IMPORT this is used to check if we
					should ignore an insert buffer merge
					request */
	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
					/*!< list of all file spaces */
	ibool		space_id_reuse_warned;
					/* !< TRUE if fil_space_create()
					has issued a warning about
					potential space_id reuse */
};

注意：其中对LRU域的描述是：redo log和共享表空间时钟被打开，所以并不放入LRU中；因此LRU是针对独立表空间的，不过本例未使用独立表空间，所以此处的LRU实际上并没有使用。

/** File node of a tablespace or the log data space */
typedef	struct fil_node_struct	fil_node_t; //这实际上就是代表一个文件，表空间文件或redo log文件。

struct fil_node_struct {
	fil_space_t*	space;	/*!< backpointer to the space where this node
				belongs */
	char*		name;	/*!< path to the file */
	ibool		open;	/*!< TRUE if file open */
	os_file_t	handle;	/*!< OS handle to the file, if file open */ //这就是文件描述符。
	ibool		is_raw_disk;/*!< TRUE if the 'file' is actually a raw
				device or a raw disk partition */
	ulint		size;	/*!< size of the file in database pages, 0 if
				not known yet; the possible last incomplete
				megabyte may be ignored if space == 0 */
	ulint		n_pending;
				/*!< count of pending i/o's on this file;
				closing of the file is not allowed if
				this is > 0 */
	ulint		n_pending_flushes;
				/*!< count of pending flushes on this file;
				closing of the file is not allowed if
				this is > 0 */
	ib_int64_t	modification_counter;/*!< when we write to the file we
				increment this by one */
	ib_int64_t	flush_counter;/*!< up to what
				modification_counter value we have
				flushed the modifications to disk */
	UT_LIST_NODE_T(fil_node_t) chain;
				/*!< link field for the file chain */
	UT_LIST_NODE_T(fil_node_t) LRU;
				/*!< link field for the LRU list */
	ulint		magic_n;/*!< FIL_NODE_MAGIC_N */
};

三、buf_pool_t* buf_pool_ptr

typedef struct buf_pool_struct buf_pool_t;

struct buf_pool_struct{

	/** @name General fields */
	/* @{ */
	mutex_t		mutex;		/*!< Buffer pool mutex of this instance */
	mutex_t		zip_mutex;	/*!< Zip mutex of this buffer
					pool instance, protects compressed
					only pages (of type buf_page_t, not
					buf_block_t */
	ulint		instance_no;	/*!< Array index of this buffer
					pool instance */
	ulint		old_pool_size;  /*!< Old pool size in bytes */
	ulint		curr_pool_size;	/*!< Current pool size in bytes */
	ulint		LRU_old_ratio;  /*!< Reserve this much of the buffer
					pool for "old" blocks */
#ifdef UNIV_DEBUG
	ulint		buddy_n_frames; /*!< Number of frames allocated from
					the buffer pool to the buddy system */
#endif
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ulint		mutex_exit_forbidden; /*!< Forbid release mutex */
#endif
	ulint		n_chunks;	/*!< number of buffer pool chunks */ //本次执行设为1
	buf_chunk_t*	chunks;		/*!< buffer pool chunks */
	ulint		curr_size;	/*!< current pool size in pages */
	hash_table_t*	page_hash;	/*!< hash table of buf_page_t or
					buf_block_t file pages,
					buf_page_in_file() == TRUE,
					indexed by (space_id, offset) */
	hash_table_t*	zip_hash;	/*!< hash table of buf_block_t blocks
					whose frames are allocated to the
					zip buddy system,
					indexed by block->frame */
	ulint		n_pend_reads;	/*!< number of pending read
					operations */
	ulint		n_pend_unzip;	/*!< number of pending decompressions */

	time_t		last_printout_time;
					/*!< when buf_print_io was last time
					called */
	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES + 1];
					/*!< Statistics of buddy system,
					indexed by block size */
	buf_pool_stat_t	stat;		/*!< current statistics */
	buf_pool_stat_t	old_stat;	/*!< old statistics */

	/* @} */

	/** @name Page flushing algorithm fields */

	/* @{ */

	mutex_t		flush_list_mutex;/*!< mutex protecting the
					flush list access. This mutex
					protects flush_list, flush_rbt
					and bpage::list pointers when
					the bpage is on flush_list. It
					also protects writes to
					bpage::oldest_modification */
	UT_LIST_BASE_NODE_T(buf_page_t) flush_list;  //按脏页修改先后顺序排列的链表，使得当需要同步checkpoint时，可以根据页修改的先后顺序来将脏页写入持久存储。
					/*!< base node of the modified block list */
	ibool		init_flush[BUF_FLUSH_N_TYPES];
					/*!< this is TRUE when a flush of the
					given type is being initialized */
	ulint		n_flush[BUF_FLUSH_N_TYPES];
					/*!< this is the number of pending
					writes in the given flush type */
	os_event_t	no_flush[BUF_FLUSH_N_TYPES];
					/*!< this is in the set state
					when there is no flush batch
					of the given type running */
	ib_rbt_t*	flush_rbt;	/*!< a red-black tree is used
					exclusively during recovery to
					speed up insertions in the
					flush_list. This tree contains
					blocks in order of
					oldest_modification LSN and is
					kept in sync with the
					flush_list.
					Each member of the tree MUST
					also be on the flush_list.
					This tree is relevant only in
					recovery and is set to NULL
					once the recovery is over.
					Protected by flush_list_mutex */
	ulint		freed_page_clock;/*!< a sequence number used
					to count the number of buffer
					blocks removed from the end of
					the LRU list; NOTE that this
					counter may wrap around at 4
					billion! A thread is allowed
					to read this for heuristic
					purposes without holding any
					mutex or latch */
	ulint		LRU_flush_ended;/*!< when an LRU flush ends for a page,
					this is incremented by one; this is
					set to zero when a buffer block is
					allocated */
	/* @} */

	/** @name LRU replacement algorithm fields */
	/* @{ */

	UT_LIST_BASE_NODE_T(buf_page_t) free;
					/*!< base node of the free block list */
	UT_LIST_BASE_NODE_T(buf_page_t) LRU;                                  
					/*!< base node of the LRU list */
	buf_page_t*	LRU_old;	/*!< pointer to the about
					LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
					oldest blocks in the LRU list;
					NULL if LRU length less than
					BUF_LRU_OLD_MIN_LEN;
					NOTE: when LRU_old != NULL, its length
					should always equal LRU_old_len */
	ulint		LRU_old_len;	/*!< length of the LRU list from
					the block to which LRU_old points
					onward, including that block;
					see buf0lru.c for the restrictions
					on this value; 0 if LRU_old == NULL;
					NOTE: LRU_old_len must be adjusted
					whenever LRU_old shrinks or grows! */

	UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
					/*!< base node of the unzip_LRU list */   

	/* @} */
	/** @name Buddy allocator fields
	The buddy allocator is used for allocating compressed page
	frames and buf_page_t descriptors of blocks that exist
	in the buffer pool only in compressed form. */
	/* @{ */
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	UT_LIST_BASE_NODE_T(buf_page_t)	zip_clean;
					/*!< unmodified compressed pages */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
	UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES];
					/*!< buddy free lists */

	buf_page_t			watch[BUF_POOL_WATCH_SIZE];
					/*!< Sentinel records for buffer
					pool watches. Protected by
				       	buf_pool->mutex. */

#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
#endif
#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
#endif
	/* @} */
};

在buf_pool_init()中初始化，本次执行只创建一个实例。buf_pool_init()----->buf_chunk_init()，注意需要额外的空间为每个缓存页保留一个buf_block_t的空间。我们应该注意到，实际的获得buf_pool的页数与申请时的大小往往是不一致的。

buf_chunk_t*
buf_chunk_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
	ulint		mem_size)	/*!< in: requested size in bytes */
{
	buf_block_t*	block;
	byte*		frame;
	ulint		i;

	/* Round down to a multiple of page size,
	although it already should be. */
	mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
	/* Reserve space for the block descriptors. */
	mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
				  + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);

	chunk->mem_size = mem_size;  //这个buf_chunk_t保存着这一大段内存。
	chunk->mem = os_mem_alloc_large(&chunk->mem_size); //内部调用mmap来获取一大段内存，MAP_PRIVATE | OS_MAP_ANON（此处后者指匿名内存映射：同时调用mmap时fd=-1）；mmap调用时就觉得映射大小，不能再增加。注意mmap与malloc的区别。

	if (UNIV_UNLIKELY(chunk->mem == NULL)) {

		return(NULL);
	}

	/* Allocate the block descriptors from
	the start of the memory block. */
	chunk->blocks = chunk->mem;  //开始处是block描述符。

	/* Align a pointer to the first frame.  Note that when
	os_large_page_size is smaller than UNIV_PAGE_SIZE,
	we may allocate one fewer block than requested.  When
	it is bigger, we may allocate more blocks than requested. */

	frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
	chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
		- (frame != chunk->mem);

	/* Subtract the space needed for block descriptors. */
	{   //以下用于计算出buf pool从哪里开始（开始处是多个buf_block_t），注意开始位置必须与16K对齐。
		ulint	size = chunk->size;

		while (frame < (byte*) (chunk->blocks + size)) {
			frame += UNIV_PAGE_SIZE;
			size--;
		}

		chunk->size = size;
	}

	/* Init block structs and assign frames for them. Then we
	assign the frames to the first blocks (we already mapped the
	memory above). */

	block = chunk->blocks;

	for (i = chunk->size; i--; ) {

		buf_block_init(buf_pool, block, frame);
		UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);

		/* Add the block to the free list */
		UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));  //buf_pool的buf_chunk_t* chunks->mem指向大缓存空间，开始处为多个buf_block_t，后面是多个页（每页16k）；buf_block_t中有buf_page_t page域，它包含页的信息，并最终连接到buf_pool->free中。（可以看到，free是buf_page_t的链表。）

		ut_d(block->page.in_free_list = TRUE);
		ut_ad(buf_pool_from_block(block) == buf_pool);

		block++;
		frame += UNIV_PAGE_SIZE;
	}

#ifdef PFS_GROUP_BUFFER_SYNC
	pfs_register_buffer_block(chunk);
#endif
	return(chunk);
}

四、trx_sys_t* trx_sys——transaction system

typedef struct trx_sys_struct trx_sys_t;

struct trx_sys_struct{
	trx_id_t	max_trx_id;	/*!< The smallest number not yet
					assigned as a transaction id or
					transaction number */
	UT_LIST_BASE_NODE_T(trx_t) trx_list;
					/*!< List of active and committed in
					memory transactions, sorted on trx id,
					biggest first */
	UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
					/*!< List of transactions created for MySQL */
	UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
					/*!< List of rollback segment objects */
	trx_rseg_t*	latest_rseg;	/*!< Latest rollback segment in the
					round-robin assignment of rollback
					segments to transactions */
	trx_rseg_t*	rseg_array[TRX_SYS_N_RSEGS];
					/*!< Pointer array to rollback
					segments; NULL if slot not in use */
	ulint		rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
					list (update undo logs for committed
					transactions), protected by rseg->mutex */
	UT_LIST_BASE_NODE_T(read_view_t) view_list;
					/*!< List of read views sorted
					on trx no, biggest first */
};

初始化见trx_sys_init_at_db_start()，会首先获得kernel_mutex（mutex protecting the server, trx structs, query threads, and lock table）。其中trx_lists_init_at_db_start()是回滚事务（dummy）的建立。

void
trx_sys_init_at_db_start(void)
/*==========================*/
{
	trx_sysf_t*	sys_header;  //Transaction system header
	ib_uint64_t	rows_to_undo	= 0;
	const char*	unit		= "";
	trx_t*		trx;  //Transaction
	mtr_t		mtr;  //Mini-transaction handle and buffer
	ib_bh_t*	ib_bh;// Binary heap

	mtr_start(&mtr);  //初始化mtr

	ut_ad(trx_sys == NULL);

	mutex_enter(&kernel_mutex);

	/* We create the min binary heap here and pass ownership to
	purge when we init the purge sub-system. Purge is responsible
	for freeing the binary heap. */

	ib_bh = ib_bh_create(
		trx_rseg_compare_last_trx_no,
		sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);

	trx_sys = mem_zalloc(sizeof(*trx_sys));

	sys_header = trx_sysf_get(&mtr); //其中会调用bug_page_get（实际调用buf_page_get_gen）去进入database page获得一个header。

	trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr); //Creates the memory copies for rollback segments and initializes the rseg list and array in trx_sys.其中会初始化128个rollback segment object（trx_rseg_t），并链接到trx_sys->rseg_list

	trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);

	/* VERY important: after the database is started, max_trx_id value is
	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
	trx_sys_get_new_trx_id will evaluate to TRUE when the function
	is first time called, and the value for trx id will be written
	to the disk-based header! Thus trx id values will not overlap when
	the database is repeatedly started! */

	trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
		+ ut_uint64_align_up(mach_read_from_8(sys_header
						   + TRX_SYS_TRX_ID_STORE),
				     TRX_SYS_TRX_ID_WRITE_MARGIN);

	UT_LIST_INIT(trx_sys->mysql_trx_list);
	trx_dummy_sess = sess_open();  //类型为sess_t，the session handle
	trx_lists_init_at_db_start();  //创建trx事务结构体，并初始化trx_sys->trx_list；它是根据回滚段和undo log list来创建的，这些事务将被回滚或清除。

	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
		trx = UT_LIST_GET_FIRST(trx_sys->trx_list);

		for (;;) {

			if (trx->conc_state != TRX_PREPARED) {
				rows_to_undo += trx->undo_no;
			}

			trx = UT_LIST_GET_NEXT(trx_list, trx);

			if (!trx) {
				break;
			}
		}

		if (rows_to_undo > 1000000000) {
			unit = "M";
			rows_to_undo = rows_to_undo / 1000000;
		}

		fprintf(stderr,
			"InnoDB: %lu transaction(s) which must be"
			" rolled back or cleaned up\n"
			"InnoDB: in total %lu%s row operations to undo\n",
			(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
			(ulong) rows_to_undo, unit);

		fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
			(ullint) trx_sys->max_trx_id);
	}

	UT_LIST_INIT(trx_sys->view_list);

	/* Transfer ownership to purge. */
	trx_purge_sys_create(ib_bh); //创建trx_pruge_t* purge_sys，purge_sys->ib_bh = ib_bh，purge_sys->state = TRX_STOP_PURGE，...
,
	mutex_exit(&kernel_mutex);

	mtr_commit(&mtr);
}

关于buf_page_get_gen()，这是一个关键函数，This is the general function used to get access to a database page，返回值为buf_block_t的指针。

本届涉及的结构体又引出了很多其他结构体，主要分为线程系统、文件系统、缓存系统、事务系统等（还有一个重要的锁），属于比较关键的数据结构，尚有很多问题没有搞清楚。下一篇从bug_page_get_gen()入手，来弄清楚数据文件与缓存的关系。