(本文中凡是未显式指出的SQL,均在协调节点上执行)
工作节点
mydb1=# SELECT * FROM master_get_active_worker_nodes();
node_name | node_port
---------------+-----------
192.168.7.131 | 5432
192.168.7.135 | 5432
192.168.7.136 | 5432
192.168.7.137 | 5432
192.168.7.133 | 5432
192.168.7.132 | 5432
192.168.7.134 | 5432
192.168.7.130 | 5432
(8 rows)
创建表test_table
create table test_table(id int, name varchar(16));
配置分片原则
SELECT master_create_distributed_table('test_table', 'id', 'hash');
根据分片数和副本数进行分片
SELECT master_create_worker_shards('test_table', 8, 2);
查看分片
mydb1=# SELECT * from pg_dist_shard order by shardid;
logicalrelid | shardid | shardstorage | shardminvalue | shardmaxvalue
--------------+---------+--------------+---------------+---------------
test_table | 102032 | t | -2147483648 | -1610612737
test_table | 102033 | t | -1610612736 | -1073741825
test_table | 102034 | t | -1073741824 | -536870913
test_table | 102035 | t | -536870912 | -1
test_table | 102036 | t | 0 | 536870911
test_table | 102037 | t | 536870912 | 1073741823
test_table | 102038 | t | 1073741824 | 1610612735
test_table | 102039 | t | 1610612736 | 2147483647
(8 rows)
可见一共有8个分片。
查看分片分布
mydb1=# SELECT * from pg_dist_shard_placement order by shardid, placementid;
shardid | shardstate | shardlength | nodename | nodeport | placementid
---------+------------+-------------+---------------+----------+-------------
102032 | 1 | 0 | 192.168.7.130 | 5432 | 33
102032 | 1 | 0 | 192.168.7.131 | 5432 | 34
102033 | 1 | 0 | 192.168.7.131 | 5432 | 35
102033 | 1 | 0 | 192.168.7.132 | 5432 | 36
102034 | 1 | 0 | 192.168.7.132 | 5432 | 37
102034 | 1 | 0 | 192.168.7.133 | 5432 | 38
102035 | 1 | 0 | 192.168.7.133 | 5432 | 39
102035 | 1 | 0 | 192.168.7.134 | 5432 | 40
102036 | 1 | 0 | 192.168.7.134 | 5432 | 41
102036 | 1 | 0 | 192.168.7.135 | 5432 | 42
102037 | 1 | 0 | 192.168.7.135 | 5432 | 43
102037 | 1 | 0 | 192.168.7.136 | 5432 | 44
102038 | 1 | 0 | 192.168.7.136 | 5432 | 45
102038 | 1 | 0 | 192.168.7.137 | 5432 | 46
102039 | 1 | 0 | 192.168.7.137 | 5432 | 47
102039 | 1 | 0 | 192.168.7.130 | 5432 | 48
(16 rows)
可见每个分片有2个副本,分布在相邻的不同工作节点上。
插入8条记录
mydb1=# select * from test_table order by id;
id | name
----+------
1 | a
2 | b
3 | c
4 | d
5 | e
6 | f
7 | g
8 | h
(8 rows)
在工作节点上查询分片内的数据
在节点192.168.7.130和节点192.168.7.131上查询分片102032(及其副本),查询结果相同。
mydb1=# select * from test_table_102032;
id | name
----+------
1 | a
8 | h
(2 rows)
直接向工作节点写数据(故意)造成数据不同步
在节点192.168.7.130上执行:
mydb1=# INSERT INTO test_table_102032 VALUES(111,'111');
INSERT 0 1
mydb1=# select * from test_table_102032;
id | name
-----+------
1 | a
8 | h
111 | 111
(3 rows)
在节点192.168.7.131上执行:
mydb1=# INSERT INTO test_table_102032 VALUES(222,'222');
INSERT 0 1
mydb1=# select * from test_table_102032;
id | name
-----+------
1 | a
8 | h
222 | 222
(3 rows)
在协调节点上查看结果
mydb1=# select * from test_table order by id;
id | name
-----+------
1 | a
2 | b
3 | c
4 | d
5 | e
6 | f
7 | g
8 | h
111 | 111
(9 rows)
可以判断:协调节点通常只从主工作节点取数据。
人为拔出“主工作节点”网线
mydb1=# select * from test_table order by id;
WARNING: could not establish asynchronous connection after 5000 ms
id | name
-----+------
1 | a
2 | b
3 | c
4 | d
5 | e
6 | f
7 | g
8 | h
222 | 222
(9 rows)
可以判断:当无法从主工作节点(192.168.7.130)获取数据时,协调节点会从副本工作节点(192.168.7.131)取数据。
将主工作节点网络恢复后,再次查询
mydb1=# select * from test_table order by id;
id | name
-----+------
1 | a
2 | b
3 | c
4 | d
5 | e
6 | f
7 | g
8 | h
111 | 111
(9 rows)
可以判断:协调节点自动切回了主工作节点
在工作节点掉线的过程中,如果不发生涉及掉线节点的写操作,分片信息和分片分布信息未发生变化。(只涉及其他节点的写操作,没有影响)
mydb1=# INSERT INTO test_table VALUES(99,'99');
INSERT 0 1
mydb1=# SELECT * from pg_dist_shard order by shardid;
logicalrelid | shardid | shardstorage | shardminvalue | shardmaxvalue
--------------+---------+--------------+---------------+---------------
test_table | 102032 | t | -2147483648 | -1610612737
test_table | 102033 | t | -1610612736 | -1073741825
test_table | 102034 | t | -1073741824 | -536870913
test_table | 102035 | t | -536870912 | -1
test_table | 102036 | t | 0 | 536870911
test_table | 102037 | t | 536870912 | 1073741823
test_table | 102038 | t | 1073741824 | 1610612735
test_table | 102039 | t | 1610612736 | 2147483647
(8 rows)
mydb1=# SELECT * from pg_dist_shard_placement order by shardid, placementid;
shardid | shardstate | shardlength | nodename | nodeport | placementid
---------+------------+-------------+---------------+----------+-------------
102032 | 1 | 0 | 192.168.7.130 | 5432 | 33
102032 | 1 | 0 | 192.168.7.131 | 5432 | 34
102033 | 1 | 0 | 192.168.7.131 | 5432 | 35
102033 | 1 | 0 | 192.168.7.132 | 5432 | 36
102034 | 1 | 0 | 192.168.7.132 | 5432 | 37
102034 | 1 | 0 | 192.168.7.133 | 5432 | 38
102035 | 1 | 0 | 192.168.7.133 | 5432 | 39
102035 | 1 | 0 | 192.168.7.134 | 5432 | 40
102036 | 1 | 0 | 192.168.7.134 | 5432 | 41
102036 | 1 | 0 | 192.168.7.135 | 5432 | 42
102037 | 1 | 0 | 192.168.7.135 | 5432 | 43
102037 | 1 | 0 | 192.168.7.136 | 5432 | 44
102038 | 1 | 0 | 192.168.7.136 | 5432 | 45
102038 | 1 | 0 | 192.168.7.137 | 5432 | 46
102039 | 1 | 0 | 192.168.7.137 | 5432 | 47
102039 | 1 | 0 | 192.168.7.130 | 5432 | 48
(16 rows)
在工作节点掉线的过程中,如果发生了涉及掉线节点的写操作,分片分布信息中“分片状态”发生了变化。(从1变成3)
mydb1=# INSERT INTO test_table VALUES(1,'1111111');
WARNING: connection error: 192.168.7.130:5432
DETAIL: could not send data to server: No route to host
could not send SSL negotiation packet: No route to host
INSERT 0 1
mydb1=# SELECT * from pg_dist_shard order by shardid;
logicalrelid | shardid | shardstorage | shardminvalue | shardmaxvalue
--------------+---------+--------------+---------------+---------------
test_table | 102032 | t | -2147483648 | -1610612737
test_table | 102033 | t | -1610612736 | -1073741825
test_table | 102034 | t | -1073741824 | -536870913
test_table | 102035 | t | -536870912 | -1
test_table | 102036 | t | 0 | 536870911
test_table | 102037 | t | 536870912 | 1073741823
test_table | 102038 | t | 1073741824 | 1610612735
test_table | 102039 | t | 1610612736 | 2147483647
(8 rows)
mydb1=# SELECT * from pg_dist_shard_placement order by shardid, placementid;
shardid | shardstate | shardlength | nodename | nodeport | placementid
---------+------------+-------------+---------------+----------+-------------
102032 | 3 | 0 | 192.168.7.130 | 5432 | 33
102032 | 1 | 0 | 192.168.7.131 | 5432 | 34
102033 | 1 | 0 | 192.168.7.131 | 5432 | 35
102033 | 1 | 0 | 192.168.7.132 | 5432 | 36
102034 | 1 | 0 | 192.168.7.132 | 5432 | 37
102034 | 1 | 0 | 192.168.7.133 | 5432 | 38
102035 | 1 | 0 | 192.168.7.133 | 5432 | 39
102035 | 1 | 0 | 192.168.7.134 | 5432 | 40
102036 | 1 | 0 | 192.168.7.134 | 5432 | 41
102036 | 1 | 0 | 192.168.7.135 | 5432 | 42
102037 | 1 | 0 | 192.168.7.135 | 5432 | 43
102037 | 1 | 0 | 192.168.7.136 | 5432 | 44
102038 | 1 | 0 | 192.168.7.136 | 5432 | 45
102038 | 1 | 0 | 192.168.7.137 | 5432 | 46
102039 | 1 | 0 | 192.168.7.137 | 5432 | 47
102039 | 1 | 0 | 192.168.7.130 | 5432 | 48
(16 rows)
此时再恢复原“主工作节点”,发现标记并未恢复;且协调节点仍会从原先的“副本工作节点”取得数据。
在节点192.168.7.130上执行:
mydb1=# select * from test_table_102032 order by id;
id | name
-----+------
1 | a
8 | h
111 | 111
(3 rows)
缺少记录 (1, ‘1111111’)
在节点192.168.7.131上执行:
mydb1=# select * from test_table_102032 order by id;
id | name
-----+---------
1 | a
1 | 1111111
8 | h
222 | 222
(4 rows)
在协调节点上执行:
mydb1=# select * from test_table order by id;
id | name
-----+---------
1 | a
1 | 1111111
2 | b
3 | c
4 | d
5 | e
6 | f
7 | g
8 | h
99 | 99
222 | 222 <-----可见是从131上取的数据
(11 rows)
查看分片分布状态:
mydb1=# SELECT * from pg_dist_shard_placement order by shardid, placementid;
shardid | shardstate | shardlength | nodename | nodeport | placementid
---------+------------+-------------+---------------+----------+-------------
102032 | 3 | 0 | 192.168.7.130 | 5432 | 33
102032 | 1 | 0 | 192.168.7.131 | 5432 | 34
102033 | 1 | 0 | 192.168.7.131 | 5432 | 35
102033 | 1 | 0 | 192.168.7.132 | 5432 | 36
102034 | 1 | 0 | 192.168.7.132 | 5432 | 37
102034 | 1 | 0 | 192.168.7.133 | 5432 | 38
102035 | 1 | 0 | 192.168.7.133 | 5432 | 39
102035 | 1 | 0 | 192.168.7.134 | 5432 | 40
102036 | 1 | 0 | 192.168.7.134 | 5432 | 41
102036 | 1 | 0 | 192.168.7.135 | 5432 | 42
102037 | 1 | 0 | 192.168.7.135 | 5432 | 43
102037 | 1 | 0 | 192.168.7.136 | 5432 | 44
102038 | 1 | 0 | 192.168.7.136 | 5432 | 45
102038 | 1 | 0 | 192.168.7.137 | 5432 | 46
102039 | 1 | 0 | 192.168.7.137 | 5432 | 47
102039 | 1 | 0 | 192.168.7.130 | 5432 | 48
(16 rows)