【事务处理】
事务处理的主要算法是两阶段提交(2PC),但并不是所有操作都需要两阶段提交。是否触发两阶段提交,通过 IsTwoPhaseCommitRequired(bool localWrite) 来判断:仅当当前事务在两个及以上节点上产生了写操作时,才需要 2PC。
/*
* Returns true if 2PC is required for consistent commit: if there was write
* activity on two or more nodes within current transaction.
*/
bool
IsTwoPhaseCommitRequired(bool localWrite){// #lizard forgives
PGXCNodeAllHandles *handles = NULL;
bool found = localWrite;
int i = 0;
#ifdef __TBASE__
int sock_fatal_count = 0;
#endif/* Never run 2PC on Datanode-to-Datanode connection */
if (IS_PGXC_DATANODE)
return false;if (MyXactFlags & XACT_FLAGS_ACCESSEDTEMPREL)
{
elog(DEBUG1, "Transaction accessed temporary objects - "
"2PC will not be used and that can lead to data inconsistencies "
"in case of failures");
return false;
}/*
* If no XID assigned, no need to run 2PC since neither coordinator nor any
* remote nodes did write operation
*/
if (!TransactionIdIsValid(GetTopTransactionIdIfAny()))
return false;#ifdef __TBASE__
handles = get_sock_fatal_handles();
sock_fatal_count = handles->dn_conn_count + handles->co_conn_count;for (i = 0; i < handles->dn_conn_count; i++)
{
PGXCNodeHandle *conn = handles->datanode_handles[i];elog(LOG, "IsTwoPhaseCommitRequired, fatal_conn=%p, fatal_conn->nodename=%s, fatal_conn->sock=%d, "
"fatal_conn->read_only=%d, fatal_conn->transaction_status=%c, "
"fatal_conn->sock_fatal_occurred=%d, conn->backend_pid=%d, fatal_conn->error=%s",
conn, conn->nodename, conn->sock, conn->read_only, conn->transaction_status,
conn->sock_fatal_occurred, conn->backend_pid, conn->error);
}for (i = 0; i < handles->co_conn_count; i++)
{
PGXCNodeHandle *conn = handles->coord_handles[i];elog(LOG, "IsTwoPhaseCommitRequired, fatal_conn=%p, fatal_conn->nodename=%s, fatal_conn->sock=%d, "
"fatal_conn->read_only=%d, fatal_conn->transaction_status=%c, "
"fatal_conn->sock_fatal_occurred=%d, conn->backend_pid=%d, fatal_conn->error=%s",
conn, conn->nodename, conn->sock, conn->read_only, conn->transaction_status,
conn->sock_fatal_occurred, conn->backend_pid, conn->error);
}
pfree_pgxc_all_handles(handles);if (sock_fatal_count != 0)
{
elog(ERROR, "IsTwoPhaseCommitRequired, Found %d sock fatal handles exist", sock_fatal_count);
}
#endif
/* get current transaction handles that we register when pgxc_node_begin */
handles = get_current_txn_handles();
for (i = 0; i < handles->dn_conn_count; i++)
{
PGXCNodeHandle *conn = handles->datanode_handles[i];#ifdef __TBASE__
elog(DEBUG5, "IsTwoPhaseCommitRequired, conn->nodename=%s, conn->sock=%d, conn->read_only=%d, conn->transaction_status=%c",
conn->nodename, conn->sock, conn->read_only, conn->transaction_status);
#endif
if (conn->sock == NO_SOCKET)
{
elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s's connection handle is invalid, backend_pid: %d",
conn->nodename, conn->backend_pid);
}
else if (!conn->read_only && conn->transaction_status == 'T')
{
if (found)
{
pfree_pgxc_all_handles(handles);
return true; /* second found */
}
else
{
found = true; /* first found */
}
}
else if (conn->transaction_status == 'E')
{
elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s is in error state, backend_pid: %d",
conn->nodename, conn->backend_pid);
}
}
for (i = 0; i < handles->co_conn_count; i++)
{
PGXCNodeHandle *conn = handles->coord_handles[i];#ifdef __TBASE__
elog(DEBUG5, "IsTwoPhaseCommitRequired, conn->nodename=%s, conn->sock=%d, conn->read_only=%d, conn->transaction_status=%c",
conn->nodename, conn->sock, conn->read_only, conn->transaction_status);
#endif
if (conn->sock == NO_SOCKET)
{
elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s's connection handle is invalid, backend_pid: %d",
conn->nodename, conn->backend_pid);
}
else if (!conn->read_only && conn->transaction_status == 'T')
{
if (found)
{
pfree_pgxc_all_handles(handles);
return true; /* second found */
}
else
{
found = true; /* first found */
}
}
else if (conn->transaction_status == 'E')
{
elog(ERROR, "IsTwoPhaseCommitRequired, remote node %s is in error state, backend_pid: %d",
conn->nodename, conn->backend_pid);
}
}
pfree_pgxc_all_handles(handles);#ifdef __TBASE__
elog(DEBUG5, "IsTwoPhaseCommitRequired return false");
#endifreturn false;
}
关于两阶段提交,下面这段注释说明了是否需要两阶段提交的判断逻辑。相关代码一方面在 backend/pgxc/pool 里,另一方面在 PG 正常的事务处理流程中。
/*
* Do pre-commit processing for remote nodes which includes Datanodes and
* Coordinators. If more than one nodes are involved in the transaction write
* activity, then we must run 2PC. For 2PC, we do the following steps:
*
* 1. PREPARE the transaction locally if the local node is involved in the
* transaction. If local node is not involved, skip this step and go to the
* next step
* 2. PREPARE the transaction on all the remote nodes. If any node fails to
* PREPARE, directly go to step 6
* 3. Now that all the involved nodes are PREPAREd, we can commit the
* transaction. We first inform the GTM that the transaction is fully
* PREPARED and also supply the list of the nodes involved in the
* transaction
* 4. COMMIT PREPARED the transaction on all the remotes nodes and then
* finally COMMIT PREPARED on the local node if its involved in the
* transaction and start a new transaction so that normal commit processing
* works unchanged. Go to step 5.
* 5. Return and let the normal commit processing resume
* 6. Abort by ereporting the error and let normal abort-processing take
* charge.
*/
【 create table 为示例】
Coordinator 节点先在自己节点上创建这个表(包括更新 pg_class 缓存,创建物理文件等),再向各节点 dispatch 命令 。
在结束事务时(finish_xact_command->CommitTransactionCommand)触发二阶段提交:
接口调用链:CommitTransactionCommand => CommitTransaction => PreCommit_Remote => SetDataRowForExtParams => IsTwoPhaseCommitRequired
【第一阶段】
prepareTransaction
1) 检查序列化冲突 (PreCommit_CheckForSerializationFailure)
2)向Datanode发送信息 “Distributed Prepare” AddRemoteQueryNode(stmts, queryString, is_local
? EXEC_ON_NONE
: (is_temp ? EXEC_ON_DATANODES : EXEC_ON_ALL_NODES));
3)Datanode 收到后走 PostgresMain => PrepareTransaction 逻辑
执行 on commit 的操作:
关闭大对象的表
检查死锁
标记事务状态为 preparing
写 XLOG_XACT_PREPARE 的事务日志,创建状态文件 (StartPrepare->EndPrepare)
标记本进程没有事务了
释放占用的资源(buffer,cache,pgstat ,缓存的 database 的旧记录,事务id,锁,谓词锁),发送invalidate 高速缓存条目的消息
清除分配的帐户与上下文内存释放 resource group
回到循环的 ready for query 状态, 回复 Coordinator
【第二阶段】
RecordTransactionCommit
1) Coordinator 插入 XLOG_XACT_COMMIT 的事务日志
2)将日志刷盘 XLogFlush
3)写 clog 日志 TransactionIdCommitTree(先提交子事务,再提交总事务)
4)向 Datanode 发送 “Distributed Commit Prepared”命令
5)Datanode 走 pgxc_node_remote_prepare逻辑,开启一个事务命令,进入 FinishPreparedTransaction :
记录 XLOG_XACT_COMMIT_PREPARED 事务日志 (RecordTransactionCommitPrepared)
将日志刷盘 XLogFlush
写分布式 clog (SetCommittedTree)
等待mirror 节点 SyncRepWaitForLSN
从进程队列中移除此进程。
删除要删的database 目录,表文件
释放二阶段提交相关的锁 lock_twophase_postcommit
释放谓词锁