6 InnoDB相关的数据结构

本文深入探讨了InnoDB存储引擎中的几个核心数据结构,包括srv_sys_t、fil_system_t、buf_pool_t和trx_sys_t。通过对这些结构的详细解析,揭示了InnoDB如何管理和操作数据库的线程、文件、缓冲池及事务。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

以下讨论的结构体或函数都是出现在innobase_start_or_create_for_mysql()中。

一、srv_sys_t*  srv_sys——server system

typedef struct srv_sys_struct	srv_sys_t;
/** The server system struct: holds the server thread table and the
queue of query threads waiting to be run. */
struct srv_sys_struct{
	srv_table_t*	threads;	/*!< server thread table */
	UT_LIST_BASE_NODE_T(que_thr_t)
			tasks;		/*!< task queue: a list whose nodes
					are que_thr_t, i.e. query graph
					query thread nodes */
};
在srv_init()中,srv_sys->threads = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t))。

typedef struct srv_slot_struct srv_slot_t;

/** Thread slot in the thread table. A slot records the thread type,
whether the slot is in use, whether the thread is currently suspended
(and since when), the event the thread waits on while suspended, and
the suspended query thread. */
struct srv_slot_struct{
	unsigned	type:1;		/*!< thread type: user, utility etc. */
	unsigned	in_use:1;	/*!< TRUE if this slot is in use */
	unsigned	suspended:1;	/*!< TRUE if the thread is waiting
					for the event of this slot */
	ib_time_t	suspend_time;	/*!< time when the thread was
					suspended */
	os_event_t	event;		/*!< event used in suspending the
					thread when it has nothing to do */
	que_thr_t*	thr;		/*!< suspended query thread (only
					used for MySQL threads) */
};


备注:该宏定义经常被用到:用于表示一个链表

/** Declares the base node of a linked list: an anonymous struct holding
the number of nodes and pointers to the first and last node.
The expansion deliberately ends at the closing brace (no trailing line
continuation), so the macro does not depend on a blank line following
its definition.
@param TYPE	the list node type */
#define UT_LIST_BASE_NODE_T(TYPE)\
struct {\
	ulint	count;	/*!< count of nodes in list */\
	TYPE *	start;	/*!< pointer to list start, NULL if empty */\
	TYPE *	end;	/*!< pointer to list end, NULL if empty */\
}

二、fil_system_t* fil_system

该结构体的初始化为fil_init();fil_open_log_and_system_tablespace_files()中打开共享表文件和redo logs。

typedef	struct fil_system_struct	fil_system_t;
/** The tablespace memory cache; also the totality of logs (the log
data space) is stored here; below we talk about tablespaces, but also
the ib_logfiles form a 'space' and it is handled here */

struct fil_system_struct {
#ifndef UNIV_HOTBACKUP
	mutex_t		mutex;		/*!< The mutex protecting the cache */
#endif /* !UNIV_HOTBACKUP */
	hash_table_t*	spaces;		/*!< The hash table of spaces in the
					system; they are hashed on the space
					id */
	hash_table_t*	name_hash;	/*!< hash table based on the space
					name */
	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
					/*!< base node for the LRU list of the
					most recently used open files with no
					pending i/o's; if we start an i/o on
					the file, we first remove it from this
					list, and return it to the start of
					the list when the i/o ends;
					log files and the system tablespace are
					not put to this list: they are opened
					after the startup, and kept open until
					shutdown */
	UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
					/*!< base node for the list of those
					tablespaces whose files contain
					unflushed writes; those spaces have
					at least one file node where
					modification_counter > flush_counter */
	ulint		n_open;		/*!< number of files currently open */
	ulint		max_n_open;	/*!< n_open is not allowed to exceed
					this */
	ib_int64_t	modification_counter;/*!< when we write to a file we
					increment this by one */
	ulint		max_assigned_id;/*!< maximum space id in the existing
					tables, or assigned during the time
					mysqld has been up; at an InnoDB
					startup we scan the data dictionary
					and set here the maximum of the
					space id's of the tables there */
	ib_int64_t	tablespace_version;
					/*!< a counter which is incremented for
					every space object memory creation;
					every space mem object gets a
					'timestamp' from this; in DISCARD/
					IMPORT this is used to check if we
					should ignore an insert buffer merge
					request */
	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
					/*!< list of all file spaces */
	ibool		space_id_reuse_warned;
					/*!< TRUE if fil_space_create()
					has issued a warning about
					potential space_id reuse */
};

注意:其中对LRU域的描述是:redo log和共享表空间始终被打开,所以并不放入LRU中;因此LRU是针对独立表空间的,不过本例未使用独立表空间,所以此处的LRU实际上并没有使用。

/** File node of a tablespace or the log data space.
In effect one node stands for one file: a tablespace file or a redo
log file. */
typedef	struct fil_node_struct	fil_node_t;

struct fil_node_struct {
	fil_space_t*	space;	/*!< backpointer to the space where this node
				belongs */
	char*		name;	/*!< path to the file */
	ibool		open;	/*!< TRUE if file open */
	os_file_t	handle;	/*!< OS handle to the file, if file open;
				presumably the file descriptor on POSIX
				systems - confirm against os_file_t */
	ibool		is_raw_disk;/*!< TRUE if the 'file' is actually a raw
				device or a raw disk partition */
	ulint		size;	/*!< size of the file in database pages, 0 if
				not known yet; the possible last incomplete
				megabyte may be ignored if space == 0 */
	ulint		n_pending;
				/*!< count of pending i/o's on this file;
				closing of the file is not allowed if
				this is > 0 */
	ulint		n_pending_flushes;
				/*!< count of pending flushes on this file;
				closing of the file is not allowed if
				this is > 0 */
	ib_int64_t	modification_counter;/*!< when we write to the file we
				increment this by one */
	ib_int64_t	flush_counter;/*!< up to what
				modification_counter value we have
				flushed the modifications to disk */
	UT_LIST_NODE_T(fil_node_t) chain;
				/*!< link field for the file chain */
	UT_LIST_NODE_T(fil_node_t) LRU;
				/*!< link field for the LRU list */
	ulint		magic_n;/*!< FIL_NODE_MAGIC_N */
};

三、buf_pool_t* buf_pool_ptr

typedef struct buf_pool_struct buf_pool_t;
/** One buffer pool instance. The fields are grouped into general
fields, page flushing fields, LRU replacement fields and buddy
allocator fields. */
struct buf_pool_struct{

	/** @name General fields */
	/* @{ */
	mutex_t		mutex;		/*!< Buffer pool mutex of this instance */
	mutex_t		zip_mutex;	/*!< Zip mutex of this buffer
					pool instance, protects compressed
					only pages (of type buf_page_t, not
					buf_block_t) */
	ulint		instance_no;	/*!< Array index of this buffer
					pool instance */
	ulint		old_pool_size;  /*!< Old pool size in bytes */
	ulint		curr_pool_size;	/*!< Current pool size in bytes */
	ulint		LRU_old_ratio;  /*!< Reserve this much of the buffer
					pool for "old" blocks */
#ifdef UNIV_DEBUG
	ulint		buddy_n_frames; /*!< Number of frames allocated from
					the buffer pool to the buddy system */
#endif
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ulint		mutex_exit_forbidden; /*!< Forbid release mutex */
#endif
	ulint		n_chunks;	/*!< number of buffer pool chunks
					(set to 1 in the run traced by this
					article) */
	buf_chunk_t*	chunks;		/*!< buffer pool chunks */
	ulint		curr_size;	/*!< current pool size in pages */
	hash_table_t*	page_hash;	/*!< hash table of buf_page_t or
					buf_block_t file pages,
					buf_page_in_file() == TRUE,
					indexed by (space_id, offset) */
	hash_table_t*	zip_hash;	/*!< hash table of buf_block_t blocks
					whose frames are allocated to the
					zip buddy system,
					indexed by block->frame */
	ulint		n_pend_reads;	/*!< number of pending read
					operations */
	ulint		n_pend_unzip;	/*!< number of pending decompressions */

	time_t		last_printout_time;
					/*!< when buf_print_io was last time
					called */
	buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES + 1];
					/*!< Statistics of buddy system,
					indexed by block size */
	buf_pool_stat_t	stat;		/*!< current statistics */
	buf_pool_stat_t	old_stat;	/*!< old statistics */

	/* @} */

	/** @name Page flushing algorithm fields */

	/* @{ */

	mutex_t		flush_list_mutex;/*!< mutex protecting the
					flush list access. This mutex
					protects flush_list, flush_rbt
					and bpage::list pointers when
					the bpage is on flush_list. It
					also protects writes to
					bpage::oldest_modification */
	UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
					/*!< base node of the modified block
					list; per the original annotation the
					dirty pages are kept in the order in
					which they were modified, so that a
					synchronous checkpoint can write them
					to persistent storage in that order */
	ibool		init_flush[BUF_FLUSH_N_TYPES];
					/*!< this is TRUE when a flush of the
					given type is being initialized */
	ulint		n_flush[BUF_FLUSH_N_TYPES];
					/*!< this is the number of pending
					writes in the given flush type */
	os_event_t	no_flush[BUF_FLUSH_N_TYPES];
					/*!< this is in the set state
					when there is no flush batch
					of the given type running */
	ib_rbt_t*	flush_rbt;	/*!< a red-black tree is used
					exclusively during recovery to
					speed up insertions in the
					flush_list. This tree contains
					blocks in order of
					oldest_modification LSN and is
					kept in sync with the
					flush_list.
					Each member of the tree MUST
					also be on the flush_list.
					This tree is relevant only in
					recovery and is set to NULL
					once the recovery is over.
					Protected by flush_list_mutex */
	ulint		freed_page_clock;/*!< a sequence number used
					to count the number of buffer
					blocks removed from the end of
					the LRU list; NOTE that this
					counter may wrap around at 4
					billion! A thread is allowed
					to read this for heuristic
					purposes without holding any
					mutex or latch */
	ulint		LRU_flush_ended;/*!< when an LRU flush ends for a page,
					this is incremented by one; this is
					set to zero when a buffer block is
					allocated */
	/* @} */

	/** @name LRU replacement algorithm fields */
	/* @{ */

	UT_LIST_BASE_NODE_T(buf_page_t) free;
					/*!< base node of the free block list */
	UT_LIST_BASE_NODE_T(buf_page_t) LRU;
					/*!< base node of the LRU list */
	buf_page_t*	LRU_old;	/*!< pointer to the about
					LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
					oldest blocks in the LRU list;
					NULL if LRU length less than
					BUF_LRU_OLD_MIN_LEN;
					NOTE: when LRU_old != NULL, its length
					should always equal LRU_old_len */
	ulint		LRU_old_len;	/*!< length of the LRU list from
					the block to which LRU_old points
					onward, including that block;
					see buf0lru.c for the restrictions
					on this value; 0 if LRU_old == NULL;
					NOTE: LRU_old_len must be adjusted
					whenever LRU_old shrinks or grows! */

	UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
					/*!< base node of the unzip_LRU list */

	/* @} */
	/** @name Buddy allocator fields
	The buddy allocator is used for allocating compressed page
	frames and buf_page_t descriptors of blocks that exist
	in the buffer pool only in compressed form. */
	/* @{ */
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	UT_LIST_BASE_NODE_T(buf_page_t)	zip_clean;
					/*!< unmodified compressed pages */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
	UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES];
					/*!< buddy free lists */

	buf_page_t			watch[BUF_POOL_WATCH_SIZE];
					/*!< Sentinel records for buffer
					pool watches. Protected by
					buf_pool->mutex. */

#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
#endif
#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
#endif
	/* @} */
};

在buf_pool_init()中初始化,本次执行只创建一个实例。buf_pool_init()----->buf_chunk_init(),注意需要额外的空间为每个缓存页保留一个buf_block_t的空间。我们应该注意到,实际的获得buf_pool的页数与申请时的大小往往是不一致的。

/** Allocates one chunk of buffer pool memory and initializes its blocks.
The chunk memory starts with the buf_block_t descriptors, followed by
the page frames, one UNIV_PAGE_SIZE frame per descriptor. Every
initialized block is appended to buf_pool->free.
@return	chunk, or NULL if the memory could not be allocated */
buf_chunk_t*
buf_chunk_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
	ulint		mem_size)	/*!< in: requested size in bytes */
{
	buf_block_t*	block;
	byte*		frame;
	ulint		i;

	/* Round down to a multiple of page size,
	although it already should be. */
	mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
	/* Reserve space for the block descriptors. */
	mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
				  + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);

	/* The chunk records the size and address of this large region
	of memory. */
	chunk->mem_size = mem_size;
	/* NOTE(review): per the original annotation, os_mem_alloc_large()
	obtains the memory with mmap() using MAP_PRIVATE | OS_MAP_ANON (an
	anonymous mapping, fd = -1), and the mapping size is decided at the
	mmap() call and cannot grow afterwards, unlike malloc(). None of
	that is visible here - confirm in os_mem_alloc_large(). */
	chunk->mem = os_mem_alloc_large(&chunk->mem_size);

	if (UNIV_UNLIKELY(chunk->mem == NULL)) {

		return(NULL);
	}

	/* Allocate the block descriptors from
	the start of the memory block. */
	chunk->blocks = chunk->mem;

	/* Align a pointer to the first frame.  Note that when
	os_large_page_size is smaller than UNIV_PAGE_SIZE,
	we may allocate one fewer block than requested.  When
	it is bigger, we may allocate more blocks than requested. */

	frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
	chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
		- (frame != chunk->mem);

	/* Subtract the space needed for block descriptors. */
	{
		/* Find where the page frames start: the descriptors
		occupy the beginning of the memory, so move frame
		forward one page at a time (giving up one block per
		step) until it lies past the last descriptor. frame
		only advances in UNIV_PAGE_SIZE steps from an aligned
		address, so it stays page-aligned (16K according to
		the original annotation). */
		ulint	size = chunk->size;

		while (frame < (byte*) (chunk->blocks + size)) {
			frame += UNIV_PAGE_SIZE;
			size--;
		}

		chunk->size = size;
	}

	/* Init block structs and assign frames for them. Then we
	assign the frames to the first blocks (we already mapped the
	memory above). */

	block = chunk->blocks;

	for (i = chunk->size; i--; ) {

		buf_block_init(buf_pool, block, frame);
		UNIV_MEM_INVALID(block->frame, UNIV_PAGE_SIZE);

		/* Add the block to the free list. chunk->mem points to
		the large memory region: descriptors (buf_block_t) first,
		then the page frames. The buf_page_t member "page" of each
		buf_block_t is what gets linked into buf_pool->free, i.e.
		free is a list of buf_page_t. */
		UT_LIST_ADD_LAST(list, buf_pool->free, (&block->page));

		ut_d(block->page.in_free_list = TRUE);
		ut_ad(buf_pool_from_block(block) == buf_pool);

		block++;
		frame += UNIV_PAGE_SIZE;
	}

#ifdef PFS_GROUP_BUFFER_SYNC
	pfs_register_buffer_block(chunk);
#endif
	return(chunk);
}

四、trx_sys_t*  trx_sys——transaction system

typedef struct trx_sys_struct trx_sys_t;

struct trx_sys_struct{
	trx_id_t	max_trx_id;	/*!< The smallest number not yet
					assigned as a transaction id or
					transaction number */
	UT_LIST_BASE_NODE_T(trx_t) trx_list;
					/*!< List of active and committed in
					memory transactions, sorted on trx id,
					biggest first */
	UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
					/*!< List of transactions created for MySQL */
	UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
					/*!< List of rollback segment objects */
	trx_rseg_t*	latest_rseg;	/*!< Latest rollback segment in the
					round-robin assignment of rollback
					segments to transactions */
	trx_rseg_t*	rseg_array[TRX_SYS_N_RSEGS];
					/*!< Pointer array to rollback
					segments; NULL if slot not in use */
	ulint		rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
					list (update undo logs for committed
					transactions), protected by rseg->mutex */
	UT_LIST_BASE_NODE_T(read_view_t) view_list;
					/*!< List of read views sorted
					on trx no, biggest first */
};
初始化见trx_sys_init_at_db_start(),会首先获得kernel_mutex(mutex protecting the server, trx structs, query threads, and lock table)。其中trx_lists_init_at_db_start()是回滚事务(dummy)的建立。
/*****************************************************************//**
Creates and initializes the in-memory transaction system (trx_sys) at
database startup. Inside one mini-transaction and under kernel_mutex it
allocates trx_sys, initializes the rollback segment list and array,
derives max_trx_id from the value stored in the transaction system
header, builds the transaction lists, reports how many transactions
must be rolled back or cleaned up, and hands the rollback segment
binary heap over to the purge subsystem. */
void
trx_sys_init_at_db_start(void)
/*==========================*/
{
	trx_sysf_t*	sys_header;	/* transaction system header */
	ib_uint64_t	rows_to_undo	= 0;
	const char*	unit		= "";
	trx_t*		trx;		/* transaction */
	mtr_t		mtr;		/* mini-transaction handle and buffer */
	ib_bh_t*	ib_bh;		/* binary heap */

	mtr_start(&mtr);	/* start the mini-transaction */

	ut_ad(trx_sys == NULL);

	mutex_enter(&kernel_mutex);

	/* We create the min binary heap here and pass ownership to
	purge when we init the purge sub-system. Purge is responsible
	for freeing the binary heap. */

	ib_bh = ib_bh_create(
		trx_rseg_compare_last_trx_no,
		sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);

	trx_sys = mem_zalloc(sizeof(*trx_sys));

	/* NOTE(review): per the original annotation, trx_sysf_get() calls
	buf_page_get (in fact buf_page_get_gen) to access the database
	page that holds the header - confirm in trx_sysf_get(). */
	sys_header = trx_sysf_get(&mtr);

	/* Creates the memory copies for rollback segments and initializes
	the rseg list and array in trx_sys. Per the original annotation
	this initializes 128 rollback segment objects (trx_rseg_t) and
	links them into trx_sys->rseg_list. */
	trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr);

	trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);

	/* VERY important: after the database is started, max_trx_id value is
	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
	trx_sys_get_new_trx_id will evaluate to TRUE when the function
	is first time called, and the value for trx id will be written
	to the disk-based header! Thus trx id values will not overlap when
	the database is repeatedly started! */

	trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
		+ ut_uint64_align_up(mach_read_from_8(sys_header
						   + TRX_SYS_TRX_ID_STORE),
				     TRX_SYS_TRX_ID_WRITE_MARGIN);

	UT_LIST_INIT(trx_sys->mysql_trx_list);
	trx_dummy_sess = sess_open();	/* of type sess_t, the session handle */

	/* Per the original annotation: creates the trx structs and
	initializes trx_sys->trx_list from the rollback segments and undo
	log lists; these transactions will be rolled back or cleaned up. */
	trx_lists_init_at_db_start();

	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
		trx = UT_LIST_GET_FIRST(trx_sys->trx_list);

		/* Sum the undo operations of every transaction that is
		not in the prepared state. */
		for (;;) {

			if (trx->conc_state != TRX_PREPARED) {
				rows_to_undo += trx->undo_no;
			}

			trx = UT_LIST_GET_NEXT(trx_list, trx);

			if (!trx) {
				break;
			}
		}

		/* Print very large counts in millions. */
		if (rows_to_undo > 1000000000) {
			unit = "M";
			rows_to_undo = rows_to_undo / 1000000;
		}

		fprintf(stderr,
			"InnoDB: %lu transaction(s) which must be"
			" rolled back or cleaned up\n"
			"InnoDB: in total %lu%s row operations to undo\n",
			(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
			(ulong) rows_to_undo, unit);

		fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
			(ullint) trx_sys->max_trx_id);
	}

	UT_LIST_INIT(trx_sys->view_list);

	/* Transfer ownership to purge. Per the original annotation this
	creates the trx_purge_t* purge_sys with purge_sys->ib_bh = ib_bh
	and purge_sys->state = TRX_STOP_PURGE. */
	trx_purge_sys_create(ib_bh);

	mutex_exit(&kernel_mutex);

	mtr_commit(&mtr);
}

关于buf_page_get_gen(),这是一个关键函数,This is the general function used to get access to a database page,返回值为buf_block_t的指针。


本节涉及的结构体又引出了很多其他结构体,主要分为线程系统、文件系统、缓存系统、事务系统等(还有一个重要的锁),属于比较关键的数据结构,尚有很多问题没有搞清楚。下一篇从buf_page_get_gen()入手,来弄清楚数据文件与缓存的关系。


<think>好的,我现在需要回答用户关于MySQL InnoDB存储引擎数据结构的问题。首先,我要回忆一下之前用户提供的引用内容,里面有关于InnoDB结构、内存和磁盘结构、索引等方面的信息。用户现在想详细了解数据结构,可能需要涵盖表结构、行格式、索引结构等部分。 首先,我需要确定InnoDB数据结构主要分为哪几部分。根据引用3,InnoDB的结构分为内存和磁盘结构。但用户可能更关注磁盘上的数据结构,比如表空间、行格式、索引的实现方式。不过内存结构中的缓冲池、写缓冲等也可能影响数据处理的效率,需要简要提及。 接下来,表空间的结构。引用1提到系统表空间存储元数据,还有.frm文件的元数据重叠。但InnoDB在MySQL8.0之后已经将元数据存储在系统表空间,而不是.frm文件了,可能需要说明这一点。表空间包括系统表空间、用户表空间(每个表的.ibd文件)、Undo表空间和临时表空间。然后是行格式,引用2提到了CHAR(M)列的存储格式,以及当没有主键时隐藏的row_id。InnoDB支持的行格式有COMPACT、DYNAMIC等,DYNAMIC是默认的,处理可变长度列更高效。行格式的结构包括记录的额外信息和真实数据部分,比如变长字段长度列表、NULL值列表、记录头信息等。索引结构方面,引用4提到InnoDB使用B+树索引,而哈希索引只适用于特定情况。主键索引是聚簇索引,叶子节点存储完整数据,辅助索引存储主键值。需要解释B+树的结构和优势,比如范围查询、高扇出减少IO次数。另外,隐藏列如row_id,当没有主键时自动生成,作为聚簇索引的依据。事务ID和回滚指针也是行结构的一部分,支持MVCC。 内存结构中的缓冲池(BufferPool)缓存数据和索引页,写缓冲(Change Buffer)优化非唯一辅助索引的更新,日志缓冲(LogBuffer)提高事务提交效率。这些虽然属于内存管理,但也影响数据访问的结构和性能。需要确保回答结构清晰,分点说明,每个部分用例子或结构图辅助理解。还要注意用户提到的LaTeX格式,所有行内数学公式用$...$,独立公式用$$...$$,比如B+树的高度计算可以用公式表示。最后,生成相关问题,可能涉及不同行格式的对比、索引优化策略、MVCC机制等。检查是否有遗漏的重要点,比如页结构(16KB的页,包含行记录和页头信息),溢出页处理(对于大字段使用DYNAMIC行格式)等。确保引用正确,比如引用2提到的隐藏列,引用4的B+树结构,引用3的内存组成部分。</think>### MySQL InnoDB存储引擎数据结构详解 #### 一、表空间结构 InnoDB的数据存储以**表空间**为核心,分为四类: 1. **系统表空间**:存储数据字典、双写缓冲、写缓冲等元数据[^1],默认文件为`ibdata1`。 2. **用户表空间**(.ibd文件):每个独立表对应一个文件,存储表数据和索引。 3. **Undo表空间**:存储事务回滚所需的Undo日志。 4. **临时表空间**:存储临时表数据。 > **注**:MySQL 8.0后元数据完全存储在系统表空间,不再依赖`.frm`文件[^1]。 --- #### 二、行格式与存储结构 InnoDB支持多种行格式,默认使用`DYNAMIC`,其核心结构如下: ```plaintext +---------------------+-------------------+-------------------+ | 变长字段长度列表 | NULL值标志位列表 | 记录头信息(6字节) | +---------------------+-------------------+-------------------+ | 事务ID(6字节) | 回滚指针(7字节) | 列数据(实际值) | +---------------------+-------------------+-------------------+ ``` 1. **隐藏列**: - 若未定义主键,自动生成`row_id`(6字节)作为聚簇索引键[^2]。 - 每行包含事务ID(`DB_TRX_ID`)和回滚指针(`DB_ROLL_PTR`),用于MVCC机制。 2. **CHAR(M)存储**: - 对于变长字符集(如UTF8),按实际长度存储,而非固定$M$字节[^2]。 3. **溢出页**: - 当单行数据超过页大小(16KB)时,`DYNAMIC`格式仅存储20字节指针,数据存于溢出页。 --- #### 三、索引结构 1. 
**B+树索引**: - **聚簇索引**:叶子节点存储完整数据行,物理有序。 - **辅助索引**:叶子节点存储主键值,查询需回表。 - 高度计算:若每个节点存储$n$个键,树高为$h$,则最多可索引$n^{h}$条记录。例如,单页存储1000键,3层B+树可支持$10^9$条记录[^4]。 2. **哈希索引**: - 仅用于自适应哈希索引(内存结构),无法替代B+树的范围查询能力[^4]。 --- #### 四、内存结构关联 1. **缓冲池(Buffer Pool)**: - 缓存数据页与索引页,采用改进LRU算法管理。 - 查询时优先访问缓冲池,减少磁盘IO[^3]。 2. **写缓冲(Change Buffer)**: - 对非唯一辅助索引的更新操作(INSERT/UPDATE)进行缓存,合并写入磁盘[^3]。 3. **日志缓冲(Log Buffer)**: - 事务提交前暂存redo日志,支持三种刷盘策略(1秒刷盘/事务提交刷盘/折中模式)[^3]。 --- #### 五、示例分析 以表`record_format_demo`为例: ```sql CREATE TABLE record_format_demo (c1 INT, c2 VARCHAR(100)); ``` - 若未定义主键,实际行结构包含: - 隐藏列`row_id` - 事务ID与回滚指针 - 变长字段长度列表(记录`c2`的实际长度) ---
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值