以下讨论的结构体或函数都是出现在innobase_start_or_create_for_mysql()中。
一、srv_sys_t* srv_sys——server system
typedef struct srv_sys_struct srv_sys_t;

/** The server system struct: holds the table of server thread slots and
the queue of query threads waiting to be run. */
struct srv_sys_struct{
	srv_table_t*	threads;	/*!< server thread table */
	UT_LIST_BASE_NODE_T(que_thr_t)
			tasks;		/*!< task queue: a linked list whose
					nodes are que_thr_t, i.e. query graph
					query thread nodes */
};
在srv_init()中,srv_sys->threads = mem_zalloc(OS_THREAD_MAX_N * sizeof(srv_slot_t))。
typedef struct srv_slot_struct srv_slot_t;
/* Thread slot in the thread table. Each slot carries three one-bit state
flags, the time the thread was suspended, the event the thread waits on
while suspended, and (for MySQL threads only) the suspended query thread.
NOTE(review): `type` is declared as a 1-bit field, so it can only
distinguish two values even though its comment lists "user, utility etc."
-- confirm against the full header. */
struct srv_slot_struct{
unsigned type:1; /*!< thread type: user, utility etc. */
unsigned in_use:1; /*!< TRUE if this slot is in use */
unsigned suspended:1; /*!< TRUE if the thread is waiting
for the event of this slot */
ib_time_t suspend_time; /*!< time when the thread was
suspended */
os_event_t event; /*!< event used in suspending the
thread when it has nothing to do */
que_thr_t* thr; /*!< suspended query thread (only
used for MySQL threads) */
};
备注:该宏定义经常被用到:用于表示一个链表
/** Declares an anonymous struct type that acts as the base node (head) of
a linked list whose elements are of type TYPE. The base node records the
number of nodes and pointers to the first and last node.
The last line of the definition must NOT end with a backslash: a trailing
line continuation would splice whatever line follows into the macro body.
@param TYPE	type of the list nodes */
#define UT_LIST_BASE_NODE_T(TYPE)\
struct {\
	ulint	count;	/*!< count of nodes in list */\
	TYPE *	start;	/*!< pointer to list start, NULL if empty */\
	TYPE *	end;	/*!< pointer to list end, NULL if empty */\
}
二、fil_system_t* fil_system
该结构体的初始化为fil_init();fil_open_log_and_system_tablespace_files()中打开共享表文件和redo logs。
typedef struct fil_system_struct fil_system_t;
/** The tablespace memory cache; also the totality of logs (the log
data space) is stored here; below we talk about tablespaces, but also
the ib_logfiles form a 'space' and it is handled here.
The cache keeps two hash tables over the spaces (by id and by name), an
LRU list of open files without pending i/o, a list of spaces with
unflushed writes, and a list of all spaces, together with the counters
that bound the number of open files and version the space objects. */
struct fil_system_struct {
#ifndef UNIV_HOTBACKUP
mutex_t mutex; /*!< The mutex protecting the cache */
#endif /* !UNIV_HOTBACKUP */
hash_table_t* spaces; /*!< The hash table of spaces in the
system; they are hashed on the space
id */
hash_table_t* name_hash; /*!< hash table based on the space
name */
UT_LIST_BASE_NODE_T(fil_node_t) LRU;
/*!< base node for the LRU list of the
most recently used open files with no
pending i/o's; if we start an i/o on
the file, we first remove it from this
list, and return it to the start of
the list when the i/o ends;
log files and the system tablespace are
not put to this list: they are opened
after the startup, and kept open until
shutdown */
UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
/*!< base node for the list of those
tablespaces whose files contain
unflushed writes; those spaces have
at least one file node where
modification_counter > flush_counter */
ulint n_open; /*!< number of files currently open */
ulint max_n_open; /*!< n_open is not allowed to exceed
this */
ib_int64_t modification_counter;/*!< when we write to a file we
increment this by one */
ulint max_assigned_id;/*!< maximum space id in the existing
tables, or assigned during the time
mysqld has been up; at an InnoDB
startup we scan the data dictionary
and set here the maximum of the
space id's of the tables there */
ib_int64_t tablespace_version;
/*!< a counter which is incremented for
every space object memory creation;
every space mem object gets a
'timestamp' from this; in DISCARD/
IMPORT this is used to check if we
should ignore an insert buffer merge
request */
UT_LIST_BASE_NODE_T(fil_space_t) space_list;
/*!< list of all file spaces */
ibool space_id_reuse_warned;
/*!< TRUE if fil_space_create()
has issued a warning about
potential space_id reuse */
};
注意:其中对LRU域的描述是:redo log和共享表空间始终被打开,所以并不放入LRU中;因此LRU是针对独立表空间的,不过本例未使用独立表空间,所以此处的LRU实际上并没有使用。
/** File node of a tablespace or the log data space */
typedef struct fil_node_struct fil_node_t; // In effect a node represents one file: a tablespace file or a redo log file.
/* Each node points back to its owning space, records the file's path,
open state and OS handle, counts pending i/o and flushes (which block
closing the file), and is linked both into its space's file chain and
into the LRU list. */
struct fil_node_struct {
fil_space_t* space; /*!< backpointer to the space where this node
belongs */
char* name; /*!< path to the file */
ibool open; /*!< TRUE if file open */
os_file_t handle; /*!< OS handle to the file, if file open */ // i.e. the file descriptor (original annotation; presumably on POSIX platforms)
ibool is_raw_disk;/*!< TRUE if the 'file' is actually a raw
device or a raw disk partition */
ulint size; /*!< size of the file in database pages, 0 if
not known yet; the possible last incomplete
megabyte may be ignored if space == 0 */
ulint n_pending;
/*!< count of pending i/o's on this file;
closing of the file is not allowed if
this is > 0 */
ulint n_pending_flushes;
/*!< count of pending flushes on this file;
closing of the file is not allowed if
this is > 0 */
ib_int64_t modification_counter;/*!< when we write to the file we
increment this by one */
ib_int64_t flush_counter;/*!< up to what
modification_counter value we have
flushed the modifications to disk */
UT_LIST_NODE_T(fil_node_t) chain;
/*!< link field for the file chain */
UT_LIST_NODE_T(fil_node_t) LRU;
/*!< link field for the LRU list */
ulint magic_n;/*!< FIL_NODE_MAGIC_N */
};
三、buf_pool_t* buf_pool_ptr
/** One buffer pool instance. The fields are grouped into general
bookkeeping (mutexes, sizes, chunks, hash tables, statistics), the page
flushing algorithm (flush list and its recovery-time red-black tree), the
LRU replacement algorithm (free, LRU and unzip_LRU lists), and the buddy
allocator used for compressed page frames. */
typedef struct buf_pool_struct buf_pool_t;struct buf_pool_struct{
/** @name General fields */
/* @{ */
mutex_t mutex; /*!< Buffer pool mutex of this instance */
mutex_t zip_mutex; /*!< Zip mutex of this buffer
pool instance, protects compressed
only pages (of type buf_page_t, not
buf_block_t) */
ulint instance_no; /*!< Array index of this buffer
pool instance */
ulint old_pool_size; /*!< Old pool size in bytes */
ulint curr_pool_size; /*!< Current pool size in bytes */
ulint LRU_old_ratio; /*!< Reserve this much of the buffer
pool for "old" blocks */
#ifdef UNIV_DEBUG
ulint buddy_n_frames; /*!< Number of frames allocated from
the buffer pool to the buddy system */
#endif
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
ulint mutex_exit_forbidden; /*!< Forbid release mutex */
#endif
ulint n_chunks; /*!< number of buffer pool chunks */ // set to 1 in the run examined in these notes
buf_chunk_t* chunks; /*!< buffer pool chunks */
ulint curr_size; /*!< current pool size in pages */
hash_table_t* page_hash; /*!< hash table of buf_page_t or
buf_block_t file pages,
buf_page_in_file() == TRUE,
indexed by (space_id, offset) */
hash_table_t* zip_hash; /*!< hash table of buf_block_t blocks
whose frames are allocated to the
zip buddy system,
indexed by block->frame */
ulint n_pend_reads; /*!< number of pending read
operations */
ulint n_pend_unzip; /*!< number of pending decompressions */
time_t last_printout_time;
/*!< when buf_print_io was last time
called */
buf_buddy_stat_t buddy_stat[BUF_BUDDY_SIZES + 1];
/*!< Statistics of buddy system,
indexed by block size */
buf_pool_stat_t stat; /*!< current statistics */
buf_pool_stat_t old_stat; /*!< old statistics */
/* @} */
/** @name Page flushing algorithm fields */
/* @{ */
mutex_t flush_list_mutex;/*!< mutex protecting the
flush list access. This mutex
protects flush_list, flush_rbt
and bpage::list pointers when
the bpage is on flush_list. It
also protects writes to
bpage::oldest_modification */
UT_LIST_BASE_NODE_T(buf_page_t) flush_list; // list ordered by when each dirty page was modified, so that when a synchronous checkpoint is needed the dirty pages can be written to persistent storage in modification order
/*!< base node of the modified block list */
ibool init_flush[BUF_FLUSH_N_TYPES];
/*!< this is TRUE when a flush of the
given type is being initialized */
ulint n_flush[BUF_FLUSH_N_TYPES];
/*!< this is the number of pending
writes in the given flush type */
os_event_t no_flush[BUF_FLUSH_N_TYPES];
/*!< this is in the set state
when there is no flush batch
of the given type running */
ib_rbt_t* flush_rbt; /*!< a red-black tree is used
exclusively during recovery to
speed up insertions in the
flush_list. This tree contains
blocks in order of
oldest_modification LSN and is
kept in sync with the
flush_list.
Each member of the tree MUST
also be on the flush_list.
This tree is relevant only in
recovery and is set to NULL
once the recovery is over.
Protected by flush_list_mutex */
ulint freed_page_clock;/*!< a sequence number used
to count the number of buffer
blocks removed from the end of
the LRU list; NOTE that this
counter may wrap around at 4
billion! A thread is allowed
to read this for heuristic
purposes without holding any
mutex or latch */
ulint LRU_flush_ended;/*!< when an LRU flush ends for a page,
this is incremented by one; this is
set to zero when a buffer block is
allocated */
/* @} */
/** @name LRU replacement algorithm fields */
/* @{ */
UT_LIST_BASE_NODE_T(buf_page_t) free;
/*!< base node of the free block list */
UT_LIST_BASE_NODE_T(buf_page_t) LRU;
/*!< base node of the LRU list */
buf_page_t* LRU_old; /*!< pointer to the about
LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
oldest blocks in the LRU list;
NULL if LRU length less than
BUF_LRU_OLD_MIN_LEN;
NOTE: when LRU_old != NULL, its length
should always equal LRU_old_len */
ulint LRU_old_len; /*!< length of the LRU list from
the block to which LRU_old points
onward, including that block;
see buf0lru.c for the restrictions
on this value; 0 if LRU_old == NULL;
NOTE: LRU_old_len must be adjusted
whenever LRU_old shrinks or grows! */
UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
/*!< base node of the unzip_LRU list */
/* @} */
/** @name Buddy allocator fields
The buddy allocator is used for allocating compressed page
frames and buf_page_t descriptors of blocks that exist
in the buffer pool only in compressed form. */
/* @{ */
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
UT_LIST_BASE_NODE_T(buf_page_t) zip_clean;
/*!< unmodified compressed pages */
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES];
/*!< buddy free lists */
buf_page_t watch[BUF_POOL_WATCH_SIZE];
/*!< Sentinel records for buffer
pool watches. Protected by
buf_pool->mutex. */
/* Compile-time sanity checks on the buddy allocator size bounds. */
#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
#endif
#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
#endif
/* @} */
};
在buf_pool_init()中初始化,本次执行只创建一个实例。buf_pool_init()----->buf_chunk_init(),注意需要额外的空间为每个缓存页保留一个buf_block_t的空间。我们应该注意到,实际的获得buf_pool的页数与申请时的大小往往是不一致的。
/** Allocates one buffer pool chunk and lays it out as an array of block
descriptors followed by page-aligned frames. Every block that fits is
initialized and appended to the free list of the buffer pool instance.
@param buf_pool	in: buffer pool instance
@param chunk	out: chunk of buffers
@param mem_size	in: requested size in bytes
@return chunk, or NULL if the large allocation failed */
buf_chunk_t*
buf_chunk_init(
/*===========*/
	buf_pool_t*	buf_pool,
	buf_chunk_t*	chunk,
	ulint		mem_size)
{
	buf_block_t*	descriptor;
	byte*		page_frame;
	ulint		n_frames;
	ulint		remaining;

	/* The request should already be a whole number of pages;
	round it down anyway. */
	mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);

	/* Add room for one block descriptor per page, rounded up to
	whole pages. */
	mem_size += ut_2pow_round(
		(mem_size / UNIV_PAGE_SIZE) * (sizeof *descriptor)
		+ (UNIV_PAGE_SIZE - 1),
		UNIV_PAGE_SIZE);

	/* The chunk records the whole region. os_mem_alloc_large() takes
	the size by pointer, so the stored size may be adjusted by it.
	NOTE(review): the original annotation says the allocator uses an
	anonymous private mmap() whose size is fixed at map time --
	confirm against os_mem_alloc_large(). */
	chunk->mem_size = mem_size;
	chunk->mem = os_mem_alloc_large(&chunk->mem_size);

	if (UNIV_UNLIKELY(chunk->mem == NULL)) {

		return(NULL);
	}

	/* The block descriptors live at the very start of the region. */
	chunk->blocks = chunk->mem;

	/* Find the first page-aligned address. When os_large_page_size
	is smaller than UNIV_PAGE_SIZE we may end up with one block fewer
	than requested; when it is bigger we may get more. */
	page_frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
	n_frames = chunk->mem_size / UNIV_PAGE_SIZE
		- (page_frame != chunk->mem);

	/* Give up leading frames until the first frame no longer
	overlaps the descriptor array; the frames must stay aligned to
	the page size. */
	while (page_frame < (byte*) (chunk->blocks + n_frames)) {
		page_frame += UNIV_PAGE_SIZE;
		n_frames--;
	}

	chunk->size = n_frames;

	/* Pair each descriptor with its frame and put the block on the
	free list. The free list links buf_page_t objects, so the
	embedded block->page member is what gets linked. */
	descriptor = chunk->blocks;

	for (remaining = chunk->size; remaining > 0; remaining--) {

		buf_block_init(buf_pool, descriptor, page_frame);
		UNIV_MEM_INVALID(descriptor->frame, UNIV_PAGE_SIZE);

		UT_LIST_ADD_LAST(list, buf_pool->free, (&descriptor->page));

		ut_d(descriptor->page.in_free_list = TRUE);
		ut_ad(buf_pool_from_block(descriptor) == buf_pool);

		descriptor++;
		page_frame += UNIV_PAGE_SIZE;
	}

#ifdef PFS_GROUP_BUFFER_SYNC
	pfs_register_buffer_block(chunk);
#endif
	return(chunk);
}
四、trx_sys_t* trx_sys——transaction system
typedef struct trx_sys_struct trx_sys_t;
/** The transaction system: holds the next unassigned transaction id, the
lists of transactions and read views, and the rollback segment objects
together with the round-robin cursor used to assign them. */
struct trx_sys_struct{
trx_id_t max_trx_id; /*!< The smallest number not yet
assigned as a transaction id or
transaction number */
UT_LIST_BASE_NODE_T(trx_t) trx_list;
/*!< List of active and committed in
memory transactions, sorted on trx id,
biggest first */
UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
/*!< List of transactions created for MySQL */
UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
/*!< List of rollback segment objects */
trx_rseg_t* latest_rseg; /*!< Latest rollback segment in the
round-robin assignment of rollback
segments to transactions */
trx_rseg_t* rseg_array[TRX_SYS_N_RSEGS];
/*!< Pointer array to rollback
segments; NULL if slot not in use */
ulint rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
list (update undo logs for committed
transactions), protected by rseg->mutex */
UT_LIST_BASE_NODE_T(read_view_t) view_list;
/*!< List of read views sorted
on trx no, biggest first */
};
初始化见trx_sys_init_at_db_start(),会首先获得kernel_mutex(mutex protecting the server, trx structs, query threads, and lock table)。其中trx_lists_init_at_db_start()是回滚事务(dummy)的建立。
void
trx_sys_init_at_db_start(void)
/*==========================*/
/* Creates and initializes the in-memory transaction system (trx_sys) at
database startup. While holding kernel_mutex it builds the rollback
segment list and array from the transaction system header, sets
max_trx_id, rebuilds the transaction list, reports how many transactions
must be rolled back or cleaned up, and finally hands the binary heap over
to the purge subsystem. */
{
	trx_sysf_t*	sys_header;		/* transaction system header */
	ib_uint64_t	rows_to_undo	= 0;
	const char*	unit		= "";
	trx_t*		trx;			/* transaction */
	mtr_t		mtr;			/* mini-transaction handle
						and buffer */
	ib_bh_t*	ib_bh;			/* binary heap */

	mtr_start(&mtr);	/* initialize the mini-transaction */

	ut_ad(trx_sys == NULL);

	mutex_enter(&kernel_mutex);

	/* We create the min binary heap here and pass ownership to
	purge when we init the purge sub-system. Purge is responsible
	for freeing the binary heap. */

	ib_bh = ib_bh_create(
		trx_rseg_compare_last_trx_no,
		sizeof(rseg_queue_t), TRX_SYS_N_RSEGS);

	trx_sys = mem_zalloc(sizeof(*trx_sys));

	/* NOTE(review): per the original annotation, trx_sysf_get() goes
	through buf_page_get (really buf_page_get_gen) to access the
	database page holding the header -- confirm in trx_sysf_get(). */
	sys_header = trx_sysf_get(&mtr);

	/* Creates the memory copies for rollback segments and initializes
	the rseg list and array in trx_sys. NOTE(review): per the original
	annotation this sets up 128 rollback segment objects (trx_rseg_t)
	linked into trx_sys->rseg_list -- confirm. */
	trx_rseg_list_and_array_init(sys_header, ib_bh, &mtr);

	trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);

	/* VERY important: after the database is started, max_trx_id value is
	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
	trx_sys_get_new_trx_id will evaluate to TRUE when the function
	is first time called, and the value for trx id will be written
	to the disk-based header! Thus trx id values will not overlap when
	the database is repeatedly started! */

	trx_sys->max_trx_id = 2 * TRX_SYS_TRX_ID_WRITE_MARGIN
		+ ut_uint64_align_up(mach_read_from_8(sys_header
						   + TRX_SYS_TRX_ID_STORE),
				     TRX_SYS_TRX_ID_WRITE_MARGIN);

	UT_LIST_INIT(trx_sys->mysql_trx_list);

	trx_dummy_sess = sess_open();	/* a sess_t, the session handle */

	/* Creates the trx structs and initializes trx_sys->trx_list.
	NOTE(review): per the original annotation they are built from the
	rollback segments and undo log lists, and these transactions will
	be rolled back or cleaned up -- confirm. */
	trx_lists_init_at_db_start();

	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
		trx = UT_LIST_GET_FIRST(trx_sys->trx_list);

		/* Sum the undo numbers of every transaction that is not
		in the prepared state. */
		for (;;) {

			if (trx->conc_state != TRX_PREPARED) {
				rows_to_undo += trx->undo_no;
			}

			trx = UT_LIST_GET_NEXT(trx_list, trx);

			if (!trx) {
				break;
			}
		}

		/* Report very large totals in millions. */
		if (rows_to_undo > 1000000000) {
			unit = "M";
			rows_to_undo = rows_to_undo / 1000000;
		}

		fprintf(stderr,
			"InnoDB: %lu transaction(s) which must be"
			" rolled back or cleaned up\n"
			"InnoDB: in total %lu%s row operations to undo\n",
			(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
			(ulong) rows_to_undo, unit);

		fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
			(ullint) trx_sys->max_trx_id);
	}

	UT_LIST_INIT(trx_sys->view_list);

	/* Transfer ownership to purge. NOTE(review): per the original
	annotation this creates trx_purge_t* purge_sys and sets
	purge_sys->ib_bh = ib_bh and purge_sys->state = TRX_STOP_PURGE,
	among other fields -- confirm in trx_purge_sys_create(). */
	trx_purge_sys_create(ib_bh);

	mutex_exit(&kernel_mutex);

	mtr_commit(&mtr);
}
关于buf_page_get_gen(),这是一个关键函数,This is the general function used to get access to a database page,返回值为buf_block_t的指针。
本节涉及的结构体又引出了很多其他结构体,主要分为线程系统、文件系统、缓存系统、事务系统等(还有一个重要的锁),属于比较关键的数据结构,尚有很多问题没有搞清楚。下一篇从buf_page_get_gen()入手,来弄清楚数据文件与缓存的关系。