replication

这是pg的主从备份,就是有一个主机把修改发给一个或多个副机,pg主从机之间是通过log来实现的,而不是采用简单的SQL,

主机是sender,从机是receiver,就是进程,有多少个receiver,主机就有多少个进程,

默认是异步的,就是主机不停的从硬盘上log中读取,之后发给副机,副机把收到的log应用到数据库中,主机中正在执行的事务是与发送没关系的,目前还没有实现从内存的log buffer上面读log传给副机

主从通过socket进行通信

//在主机sender部分,在内享内存里面,
/* There is one WalSndCtl struct for the whole database cluster */
typedef struct
{
	/*
	 * Synchronous replication queue with one queue per request type.
	 * Protected by SyncRepLock.
	 */
	SHM_QUEUE	SyncRepQueue[NUM_SYNC_REP_WAIT_MODE];

	/*
	 * Current location of the head of the queue. All waiters should have a
	 * waitLSN that follows this value. Protected by SyncRepLock.
	 */
	XLogRecPtr	lsn[NUM_SYNC_REP_WAIT_MODE];

	/*
	 * Are any sync standbys defined?  Waiting backends can't reload the
	 * config file safely, so checkpointer updates this value as needed.
	 * Protected by SyncRepLock.
	 */
	bool		sync_standbys_defined;

	WalSnd		walsnds[1];		/* VARIABLE LENGTH ARRAY */
} WalSndCtlData;


//在主机sender部分
/*
 * Each walsender has a WalSnd struct in shared memory.
 */
typedef struct WalSnd
{
	pid_t		pid;			/* this walsender's process id, or 0 */
	WalSndState state;			/* this walsender's state */
	XLogRecPtr	sentPtr;		/* WAL has been sent up to this point */
	bool		needreload;		/* does currently-open file need to be reloaded? */

	/*
	 * The xlog locations that have been written, flushed, and applied by
	 * standby-side. These may be invalid if the standby-side has not offered
	 * values yet.
	 */
	XLogRecPtr	write;//这三个是从副机传过来的,表示已发送日志的位置,全是相对于log而言的
	XLogRecPtr	flush;//表明是fsync到硬盘上面了
	XLogRecPtr	apply;//副机已把接收到的log,应用到数据库的位置

	/* Protects shared variables shown above. */
	slock_t		mutex;

	/*
	 * Latch used by backends to wake up this walsender when it has work to
	 * do.
	 */
	Latch		latch;

	/*
	 * The priority order of the standby managed by this WALSender, as listed
	 * in synchronous_standby_names, or 0 if not-listed. Protected by
	 * SyncRepLock.
	 */
	int			sync_standby_priority;
} WalSnd;


//每个sender的状态


typedef enum WalSndState
{
 WALSNDSTATE_STARTUP = 0,
 WALSNDSTATE_BACKUP,
 WALSNDSTATE_CATCHUP,
 WALSNDSTATE_STREAMING
} WalSndState;
/*每个传送log的最大值
 * Maximum data payload in a WAL data message. Must be >= XLOG_BLCKSZ.
 *
 * We don't have a good idea of what a good value would be; there's some
 * overhead per message in both walsender and walreceiver, but on the other
 * hand sending large batches makes walsender less responsive to signals
 * because signals are checked only between messages.  128kB (with
 * default 8k blocks) seems like a reasonable guess for now.
 */
#define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)


 

 

/*在副机的receive,在共享内存中也存在数据结构,和状态
 * Values for WalRcv->walRcvState.
 */
typedef enum
{
	WALRCV_STOPPED,				/* stopped and mustn't start up again */
	WALRCV_STARTING,			/* launched, but the process hasn't
								 * initialized yet */
	WALRCV_RUNNING,				/* walreceiver is running */
	WALRCV_STOPPING				/* requested to stop, but still running */
} WalRcvState;

/* Shared memory area for management of walreceiver process */
typedef struct
{
	/*
	 * PID of currently active walreceiver process, its current state and
	 * start time (actually, the time at which it was requested to be
	 * started).
	 */
	pid_t		pid;
	WalRcvState walRcvState;
	pg_time_t	startTime;

	/*
	 * receiveStart is the first byte position that will be received. When
	 * startup process starts the walreceiver, it sets receiveStart to the
	 * point where it wants the streaming to begin.
	 */
	XLogRecPtr	receiveStart;

	/*
	 * receivedUpto-1 is the last byte position that has already been
	 * received.  At the first startup of walreceiver, receivedUpto is set to
	 * receiveStart. After that, walreceiver updates this whenever it flushes
	 * the received WAL to disk.
	 */
	XLogRecPtr	receivedUpto;

	/*
	 * latestChunkStart is the starting byte position of the current "batch"
	 * of received WAL.  It's actually the same as the previous value of
	 * receivedUpto before the last flush to disk.	Startup process can use
	 * this to detect whether it's keeping up or not.
	 */
	XLogRecPtr	latestChunkStart;

	/*
	 * Time of send and receive of any message received.
	 */
	TimestampTz lastMsgSendTime;
	TimestampTz lastMsgReceiptTime;

	/*
	 * connection string; is used for walreceiver to connect with the primary.
	 */
	char		conninfo[MAXCONNINFO];

	slock_t		mutex;			/* locks shared variables shown above */
} WalRcvData;


 

//主从机发送的log信息: 'w' + WalDataMessageHeader + MAX_SEND_SIZE的log
/*
 * Header for a WAL data message (message type 'w').  This is wrapped within
 * a CopyData message at the FE/BE protocol level.
 *
 * The header is followed by actual WAL data.  Note that the data length is
 * not specified in the header --- it's just whatever remains in the message.
 *
 * walEnd and sendTime are not essential data, but are provided in case
 * the receiver wants to adjust its behavior depending on how far behind
 * it is.
 */
typedef struct
{
	/* WAL start location of the data included in this message */
	XLogRecPtr	dataStart;

	/* Current end of WAL on the sender */
	XLogRecPtr	walEnd;

	/* Sender's system clock at the time of transmission */
	TimestampTz sendTime;
} WalDataMessageHeader;


 

//通信协议

/*
 * All messages from WalSender must contain these fields to allow us to
 * correctly calculate the replication delay.
 */
typedef struct
{
	/* Current end of WAL on the sender */
	XLogRecPtr	walEnd;

	/* Sender's system clock at the time of transmission */
	TimestampTz sendTime;
} WalSndrMessage;

/*
 * Keepalive message from primary (message type 'k'). (lowercase k)
 * This is wrapped within a CopyData message at the FE/BE protocol level.
 *
 * Note that the data length is not specified here.
 */
typedef WalSndrMessage	PrimaryKeepaliveMessage;



/*
 * Reply message from standby (message type 'r').  This is wrapped within
 * a CopyData message at the FE/BE protocol level.
 *
 * Note that the data length is not specified here.
 */
typedef struct
{
	/*
	 * The xlog locations that have been written, flushed, and applied by
	 * standby-side. These may be invalid if the standby-side is unable to or
	 * chooses not to report these.
	 */
	XLogRecPtr	write;
	XLogRecPtr	flush;
	XLogRecPtr	apply;

	/* Sender's system clock at the time of transmission */
	TimestampTz sendTime;
} StandbyReplyMessage;



/*
 * Hot Standby feedback from standby (message type 'h').  This is wrapped within
 * a CopyData message at the FE/BE protocol level.
 *
 * Note that the data length is not specified here.
 */
typedef struct
{
	/*
	 * The current xmin and epoch from the standby, for Hot Standby feedback.
	 * This may be invalid if the standby-side does not support feedback, or
	 * Hot Standby is not yet available.
	 */
	TransactionId xmin;
	uint32		epoch;

	/* Sender's system clock at the time of transmission */
	TimestampTz sendTime;
} StandbyHSFeedbackMessage;

/*
receiver端接收:
'w' : 就是log数据
'k' : 向主机发送keepalive信息,就是PrimaryKeepaliveMessage
receiver端发送:
'r' : 有数据时,告知主机log接收状况,就是StandbyReplyMessage
'h' : 无数据时,就是StandbyHSFeedbackMessage


sender端接收和发送,就是对应receiver端

*/

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值