这是pg的主从备份,就是有一个主机把修改发给一个或多个副机,pg主从机之间是通过log来实现的,而不是采用简单的SQL,
主机是sender,从机是receiver,就是进程,有多少个receiver,主机就有多少个进程,
默认是异步的,就是主机不停的从硬盘上log中读取,之后发给副机,副机把收到的log应用到数据库中,主机中正在执行的事务是与发送没关系的,目前还没有实现从内存的log buffer上面读log传给副机
主从通过socket进行通信
//在主机sender部分,在内享内存里面,
/* There is one WalSndCtl struct for the whole database cluster */
typedef struct
{
/*
* Synchronous replication queue with one queue per request type.
* Protected by SyncRepLock.
*/
SHM_QUEUE SyncRepQueue[NUM_SYNC_REP_WAIT_MODE];
/*
* Current location of the head of the queue. All waiters should have a
* waitLSN that follows this value. Protected by SyncRepLock.
*/
XLogRecPtr lsn[NUM_SYNC_REP_WAIT_MODE];
/*
* Are any sync standbys defined? Waiting backends can't reload the
* config file safely, so checkpointer updates this value as needed.
* Protected by SyncRepLock.
*/
bool sync_standbys_defined;
WalSnd walsnds[1]; /* VARIABLE LENGTH ARRAY */
} WalSndCtlData;
//在主机sender部分
/*
* Each walsender has a WalSnd struct in shared memory.
*/
typedef struct WalSnd
{
pid_t pid; /* this walsender's process id, or 0 */
WalSndState state; /* this walsender's state */
XLogRecPtr sentPtr; /* WAL has been sent up to this point */
bool needreload; /* does currently-open file need to be reloaded? */
/*
* The xlog locations that have been written, flushed, and applied by
* standby-side. These may be invalid if the standby-side has not offered
* values yet.
*/
XLogRecPtr write;//这三个是从副机传过来的,表示已发送日志的位置,全是相对于log而言的
XLogRecPtr flush;//表明是fsync到硬盘上面了
XLogRecPtr apply;//副机已把接收到的log,应用到数据库的位置
/* Protects shared variables shown above. */
slock_t mutex;
/*
* Latch used by backends to wake up this walsender when it has work to
* do.
*/
Latch latch;
/*
* The priority order of the standby managed by this WALSender, as listed
* in synchronous_standby_names, or 0 if not-listed. Protected by
* SyncRepLock.
*/
int sync_standby_priority;
} WalSnd;
//每个sender的状态
typedef enum WalSndState
{
WALSNDSTATE_STARTUP = 0,
WALSNDSTATE_BACKUP,
WALSNDSTATE_CATCHUP,
WALSNDSTATE_STREAMING
} WalSndState;
/*每个传送log的最大值
* Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ.
*
* We don't have a good idea of what a good value would be; there's some
* overhead per message in both walsender and walreceiver, but on the other
* hand sending large batches makes walsender less responsive to signals
* because signals are checked only between messages. 128kB (with
* default 8k blocks) seems like a reasonable guess for now.
*/
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)
/*在副机的receive,在共享内存中也存在数据结构,和状态
* Values for WalRcv->walRcvState.
*/
typedef enum
{
WALRCV_STOPPED, /* stopped and mustn't start up again */
WALRCV_STARTING, /* launched, but the process hasn't
* initialized yet */
WALRCV_RUNNING, /* walreceiver is running */
WALRCV_STOPPING /* requested to stop, but still running */
} WalRcvState;
/* Shared memory area for management of walreceiver process */
typedef struct
{
/*
* PID of currently active walreceiver process, its current state and
* start time (actually, the time at which it was requested to be
* started).
*/
pid_t pid;
WalRcvState walRcvState;
pg_time_t startTime;
/*
* receiveStart is the first byte position that will be received. When
* startup process starts the walreceiver, it sets receiveStart to the
* point where it wants the streaming to begin.
*/
XLogRecPtr receiveStart;
/*
* receivedUpto-1 is the last byte position that has already been
* received. At the first startup of walreceiver, receivedUpto is set to
* receiveStart. After that, walreceiver updates this whenever it flushes
* the received WAL to disk.
*/
XLogRecPtr receivedUpto;
/*
* latestChunkStart is the starting byte position of the current "batch"
* of received WAL. It's actually the same as the previous value of
* receivedUpto before the last flush to disk. Startup process can use
* this to detect whether it's keeping up or not.
*/
XLogRecPtr latestChunkStart;
/*
* Time of send and receive of any message received.
*/
TimestampTz lastMsgSendTime;
TimestampTz lastMsgReceiptTime;
/*
* connection string; is used for walreceiver to connect with the primary.
*/
char conninfo[MAXCONNINFO];
slock_t mutex; /* locks shared variables shown above */
} WalRcvData;
//主从机发送的log信息: 'w' + WalDataMessageHeader + MAX_SEND_SIZE的log
/*
* Header for a WAL data message (message type 'w'). This is wrapped within
* a CopyData message at the FE/BE protocol level.
*
* The header is followed by actual WAL data. Note that the data length is
* not specified in the header --- it's just whatever remains in the message.
*
* walEnd and sendTime are not essential data, but are provided in case
* the receiver wants to adjust its behavior depending on how far behind
* it is.
*/
typedef struct
{
/* WAL start location of the data included in this message */
XLogRecPtr dataStart;
/* Current end of WAL on the sender */
XLogRecPtr walEnd;
/* Sender's system clock at the time of transmission */
TimestampTz sendTime;
} WalDataMessageHeader;
//通信协议
/*
* All messages from WalSender must contain these fields to allow us to
* correctly calculate the replication delay.
*/
typedef struct
{
/* Current end of WAL on the sender */
XLogRecPtr walEnd;
/* Sender's system clock at the time of transmission */
TimestampTz sendTime;
} WalSndrMessage;
/*
* Keepalive message from primary (message type 'k'). (lowercase k)
* This is wrapped within a CopyData message at the FE/BE protocol level.
*
* Note that the data length is not specified here.
*/
typedef WalSndrMessage PrimaryKeepaliveMessage;
/*
* Reply message from standby (message type 'r'). This is wrapped within
* a CopyData message at the FE/BE protocol level.
*
* Note that the data length is not specified here.
*/
typedef struct
{
/*
* The xlog locations that have been written, flushed, and applied by
* standby-side. These may be invalid if the standby-side is unable to or
* chooses not to report these.
*/
XLogRecPtr write;
XLogRecPtr flush;
XLogRecPtr apply;
/* Sender's system clock at the time of transmission */
TimestampTz sendTime;
} StandbyReplyMessage;
/*
* Hot Standby feedback from standby (message type 'h'). This is wrapped within
* a CopyData message at the FE/BE protocol level.
*
* Note that the data length is not specified here.
*/
typedef struct
{
/*
* The current xmin and epoch from the standby, for Hot Standby feedback.
* This may be invalid if the standby-side does not support feedback, or
* Hot Standby is not yet available.
*/
TransactionId xmin;
uint32 epoch;
/* Sender's system clock at the time of transmission */
TimestampTz sendTime;
} StandbyHSFeedbackMessage;
/*
receiver端接收:
'w' : 就是log数据
'k' : 向主机发送keepalive信息,就是PrimaryKeepaliveMessage
receiver端发送:
'r' : 有数据时,告知主机log接收状况,就是StandbyReplyMessage
'h' : 无数据时,就是StandbyHSFeedbackMessage
sender端接收和发送,就是对应receiver端
*/
502

被折叠的 条评论
为什么被折叠?



