摘要:
monetdb的事务隔离级别只有RR, 和mysql/innodb的具有RR和RC两个常用的隔离级别并且也实现了读未提交和可串行化的隔离级别不同.
本文分析monetdb的RR隔离级别的实现方式, 以及分析这种隔离级别方式如何导致只有RR隔离级别.
测试流程:
测试方式:
- 分别开两个mclient终端, 连接同一个mserver实例
- 两个client终端分别叫做客户端A和客户端B
- 客户端A开启auto commit, 执行的sql为插入一条数据
- insert into t1 values(3);
- 客户端B执行start transaction开启显式事务, 执行的sql为统计数据的数量
- start transaction;
- select count(1) from t1;
- 客户端A插入一条数据后, 在客户端B执行select count(1) 查看数据的数量
测试结果:
- 无论客户端A插入多少条数据
- 客户端B执行了start transaction后,通过select count(1)查询出的数据的数量永远不变
- 也就是说客户端B只能读到start transaction执行之前的数据
- 客户端B的隔离级别为可重复读, repeatable read, 即RR
RR的隔离级别的原因定位方式:
- 客户端A是auto commit的方式, 每插入一条数据就会自动提交事务
- 对客户端B而言, 数据的可见性来说,A插入的数据, 都是已经提交的
- 但是B看不到A插入的数据, 那么对B来说, 必然是存在某种规则对数据进行了过滤
- 要排查的话, 应该从B的角度, 追查为什么无法拿到A插入的数据
排查过程:
一. 通过日志分析select count的执行的核心函数:
日志:
2023-11-08 08:42:24 M_DEBUG ALGO client3 monetdb5/mal/mal_interpreter.c:716 runMALsequence calling querylog.define
2023-11-08 08:42:24 M_DEBUG ALGO client3 monetdb5/mal/mal_interpreter.c:687 runMALsequence calling sql.count
2023-11-08 08:45:56 M_DEBUG ALGO client3 monetdb5/mal/mal_interpreter.c:687 runMALsequence calling sql.resultSet
2023-11-08 08:45:56 M_DEBUG ALGO client3 gdk/gdk_bat.c:292 COLnew2 -> tmp_647#0@0[lng]TSRN
2023-11-08 08:45:56 M_DEBUG ALGO client3 gdk/gdk_bat.c:1039 BUNappendmulti tmp_647#0@0[lng]TSRN appending 1 values
2023-11-08 08:45:56 M_DEBUG ALGO client3 gdk/gdk_bat.c:292 COLnew2 -> tmp_725#0@0[oid]TSRN
2023-11-08 08:45:56 M_DEBUG ALGO client3 gdk/gdk_batop.c:2846 BATconstant -> tmp_725#1@0[oid]TSRKN 23usec
核心函数:
calling sql.count
二. 分析 sql.count的执行细节:
调用堆栈:
#0 segs_end (segs=0x1952b230, tr=0x7f9fb0104450, table=0x1952aa60) at /root/work/monetdb-dev/trunk/monetdb/sql/storage/bat/bat_storage.c:457
#1 0x00007fa055d07ad1 in count_col (tr=0x7f9fb0104450, c=0x1952b2d0, access=10) at /root/work/monetdb-dev/trunk/monetdb/sql/storage/bat/bat_storage.c:773
#2 0x00007fa055c85186 in SQLbasecount (cntxt=0x13b4580, mb=0x7f9fb0135570, stk=0x7f9fb013c3e0, pci=0x7f9fb0144400) at /root/work/monetdb-dev/trunk/monetdb/sql/backends/monet5/sql_rank.c:1289
#3 0x00007fa069395007 in runMALsequence (cntxt=0x13b4580, mb=0x7f9fb0135570, startpc=1, stoppc=0, stk=0x7f9fb013c3e0, env=0x0, pcicaller=0x0)
at /root/work/monetdb-dev/trunk/monetdb/monetdb5/mal/mal_interpreter.c:688
#4 0x00007fa06939377e in runMAL (cntxt=0x13b4580, mb=0x7f9fb0135570, mbcaller=0x0, env=0x0) at /root/work/monetdb-dev/trunk/monetdb/monetdb5/mal/mal_interpreter.c:357
#5 0x00007fa055bb85eb in SQLrun (c=0x13b4580, m=0x7f9fb01167e0) at /root/work/monetdb-dev/trunk/monetdb/sql/backends/monet5/sql_execute.c:259
#6 0x00007fa055bb9ee7 in SQLengineIntern (c=0x13b4580, be=0x7f9fb0136060) at /root/work/monetdb-dev/trunk/monetdb/sql/backends/monet5/sql_execute.c:709
#7 0x00007fa055bb74b5 in SQLengine (c=0x13b4580) at /root/work/monetdb-dev/trunk/monetdb/sql/backends/monet5/sql_scenario.c:1358
#8 0x00007fa0693b4862 in runPhase (c=0x13b4580, phase=4) at /root/work/monetdb-dev/trunk/monetdb/monetdb5/mal/mal_scenario.c:453
#9 0x00007fa0693b49cc in runScenarioBody (c=0x13b4580, once=0) at /root/work/monetdb-dev/trunk/monetdb/monetdb5/mal/mal_scenario.c:479
#10 0x00007fa0693b4bd8 in runScenario (c=0x13b4580, once=0) at /root/work/monetdb-dev/trunk/monetdb/monetdb5/mal/mal_scenario.c:510
#11 0x00007fa0693b6fea in MSserveClient (c=0x13b4580) at /root/work/monetdb-dev/trunk/monetdb/monetdb5/mal/mal_session.c:589
#12 0x00007fa0693b6863 in MSscheduleClient (command=0x7f9fb0000b70 '\333' <repeats 199 times>, <incomplete sequence \333>..., challenge=0x7f9ff7bfcce3 "gWPRtbcO", fin=0x7f9fb0002b90,
fout=0x7f9fc4009630, protocol=PROTOCOL_9, blocksize=8190) at /root/work/monetdb-dev/trunk/monetdb/monetdb5/mal/mal_session.c:445
#13 0x00007fa06947c1d4 in doChallenge (data=0x7f9fc4006790) at /root/work/monetdb-dev/trunk/monetdb/monetdb5/modules/mal/mal_mapi.c:222
#14 0x00007fa068d2e729 in THRstarter (a=0x7f9fc400bb20) at /root/work/monetdb-dev/trunk/monetdb/gdk/gdk_utils.c:1668
#15 0x00007fa068dabb23 in thread_starter (arg=0x7f9fc400bb90) at /root/work/monetdb-dev/trunk/monetdb/gdk/gdk_system.c:862
#16 0x00007fa0682ec1ca in start_thread () from /lib64/libpthread.so.0
#17 0x00007fa067f58e73 in clone () from /lib64/libc.so.6
核心函数:
segs_end
/* Return the row-count upper bound (end offset of the last visible segment)
 * of segment list `segs` as seen by transaction `tr`.
 * Segments whose commit timestamp postdates tr's start timestamp fail
 * SEG_IS_VALID and are skipped, so the count is frozen at the snapshot
 * taken when tr started — this is what produces repeatable-read behavior. */
static size_t
segs_end( segments *segs, sql_trans *tr, sql_table *table)
{
size_t cnt = 0;
/* serialize against concurrent mutation of this table's segment list */
lock_table(tr->store, table->base.id);
segment *s = segs->h, *l = NULL;
/* fast path: if the tail segment is already visible to tr,
 * start the scan there instead of at the head */
if (segs->t && SEG_IS_VALID(segs->t, tr))
l = s = segs->t;
/* walk forward, remembering the last segment visible to tr */
for(;s; s = s->next) {
if (SEG_IS_VALID(s, tr))
l = s;
}
if (l)
/* count = end offset of the last visible segment */
cnt = l->end;
unlock_table(tr->store, table->base.id);
return cnt;
}
SEG_IS_VALID
/* A segment is visible when it belongs to the current transaction in some
 * way, or was deleted by some other transaction but used to be valid
 * (i.e. its old timestamp still falls inside our snapshot). */
#define SEG_IS_VALID(seg, tr) \
((!seg->deleted && VALID_4_READ(seg->ts, tr)) || \
(seg->deleted && OLD_VALID_4_READ(seg->ts, seg->oldts, tr)))
VALID_4_READ
/* valid
* !deleted && VALID_4_READ(TS, tr) existing or newly created segment
* deleted && TS > tr->ts && OLDTS < tr->ts deleted after current transaction
*/
/* TS is readable by tr when it is (1) tr's own transaction id, i.e. a
 * change made by this very transaction, (2) a version belonging to a
 * parent transaction in a nested-transaction chain, or (3) a commit
 * timestamp strictly earlier than tr's start timestamp tr->ts.
 * Case (3) is the snapshot filter: commits after tr started stay hidden. */
#define VALID_4_READ(TS,tr) \
(TS == tr->tid || (tr->parent && tr_version_of_parent(tr, TS)) || TS < tr->ts)
核心数据:
拿到的segment:
(gdb) p l[0]
$47 = {
start = 0,
end = 25,
deleted = false,
ts = 1,
oldts = 0,
next = 0x7f9fbc027ed0,
prev = 0x0
}
未拿到的segment:
(gdb) p segs->t[0]
$46 = {
start = 25,
end = 26,
deleted = false,
ts = 1875,
oldts = 0,
next = 0x0,
prev = 0x0
}
三. 分析过滤segment的代码逻辑:
核心处理:
/* valid
* !deleted && VALID_4_READ(TS, tr) existing or newly created segment
* deleted && TS > tr->ts && OLDTS < tr->ts deleted after current transaction
*/
/* Visibility test: TS passes when it is tr's own tid, a parent
 * transaction's version, or a commit timestamp before tr's start
 * timestamp — the last disjunct is what pins tr to its snapshot. */
#define VALID_4_READ(TS,tr) \
(TS == tr->tid || (tr->parent && tr_version_of_parent(tr, TS)) || TS < tr->ts)
核心数据结构:
sql_trans
/* Per-transaction state. The two timestamps drive visibility:
 * `ts` is the snapshot boundary (transaction start time) and `tid` the
 * identity used to recognize the transaction's own uncommitted changes. */
typedef struct sql_trans {
char *name;
ulng ts; /* transaction start timestamp */
ulng tid; /* transaction id */
sql_store store; /* keep link into the global store */
MT_Lock lock; /* lock protecting concurrent writes to the changes list */
list *changes; /* list of changes */
list *dropped; /* protection against recursive cascade action */
list *predicates; /* list of read predicates logged during update transactions */
list *dependencies; /* list of dependencies created (list of sqlids from the objects) */
list *depchanges; /* list of dependencies changed (it would be tested for conflicts at the end of the transaction) */
lng logchanges; /* count number of changes to be applied to the wal */
int active; /* is active transaction */
int status; /* status of the last query */
sql_catalog *cat;
sql_schema *tmp; /* each session has its own tmp schema */
changeset localtmps;
sql_allocator *sa; /* transaction allocator */
struct sql_trans *parent; /* multilevel transaction support */
} sql_trans;
segment
/* One contiguous run of rows [start, end) with a single visibility
 * timestamp; tables track their rows as a linked list of these. */
typedef struct segment {
BUN start;
BUN end;
bool deleted; /* we need to keep a dense segment set, 0 - end of last segment,
some segments may be deleted */
ulng ts; /* timestamp on this segment, ie tid of some active transaction or commit time of append/delete or
rollback time, ie ready for reuse */
ulng oldts; /* keep previous ts, for rollbacks */
struct segment *next; /* usually one should be enough */
struct segment *prev; /* used in destruction list */
} segment;
核心判断逻辑:
1. (TS == tr->tid)
- TS 是 seg->ts
- tid是transaction id, 也就是事务id /* transaction id */
- tr->ts 是 /* transaction start timestamp */
- seg->ts 和 tr->tid 进行比较,其实使用了一些hack技巧和成员复用
2. (TS < tr->ts)
- 重点是这个逻辑
- tr->ts 是这个事务开始的时间
- TS 是 seg->ts, 是这个数据段被提交的时间
- 所以这个判断的逻辑, 就是tr这个事务, 只能看到这个开始之前被记录的数据段里的数据
- 也就说, 从tr这个事务开始后, 再被提交的事务, 都无法被看到
- 从调用方的角度, 就是 repeatable read (可重复读) 的隔离级别
如何扩充新的隔离级别呢?
- 注意我说的是扩充新的隔离级别,而不是修改已有的隔离级别
- RR的隔离级别需要被保留
- 使用mysql的隔离级别的参数来控制
- 新增RC的隔离级别
- RC的隔离级别, 可以看到已经提交的事务的数据, 而不仅仅是 TS < tr->ts 的