The implementations in this article target Java 11+.
Consistency is one of the core challenges in building reliable distributed systems. ZooKeeper's ZAB protocol and the Paxos algorithm are two mainstream solutions, each with its own theoretical foundations and engineering characteristics. This article analyzes their implementation mechanics, performance characteristics, and best practices.
1. Basic Concepts
The ZAB Protocol
ZAB (ZooKeeper Atomic Broadcast) is a distributed consistency protocol designed specifically for ZooKeeper. Its core goal is to guarantee the atomicity and sequential ordering of data updates across the distributed system.
The Paxos Algorithm
Paxos, proposed by Leslie Lamport, is a general-purpose distributed consensus algorithm and the theoretical foundation of many distributed systems. It addresses how to reach agreement over an unreliable network.
2. ZAB Protocol Implementation
The ZAB protocol operates in two modes:
- Recovery mode: entered at system startup or when the Leader crashes
- Broadcast mode: handles write requests during normal operation
Core Interface Definitions
public interface ZabProcessor {
// Recovery-mode interface
boolean startRecovery() throws RecoveryException;
// Broadcast-mode interfaces
CompletableFuture<Boolean> processWrite(Request request);
CompletableFuture<Result> processRead(String key, ConsistencyLevel level);
// State-query interfaces
boolean isLeader();
long getCurrentEpoch();
}
public interface NetworkClient {
// Basic network communication
void connect(String serverId, String address, int port) throws IOException;
void disconnect(String serverId);
// ZAB protocol messages
ACK sendProposal(String serverId, ProposalPacket proposal) throws IOException;
void sendCommit(String serverId, CommitPacket commit) throws IOException;
LastZxidResponse sendEpochRequest(String serverId, EpochPacket epochPkt) throws IOException;
boolean sendTruncate(String serverId, TruncatePacket truncPkt) throws IOException;
boolean sendTransactions(String serverId, List<Transaction> txns) throws IOException;
boolean sendNewLeader(String serverId, NewLeaderPacket newLeaderPkt) throws IOException;
void sendHeartbeat(String serverId, long zxid) throws IOException;
void sendSnapshot(String serverId, byte[] snapshot, long zxid) throws IOException;
}
public interface StateMachine {
void apply(long zxid, byte[] command) throws Exception;
long getLastAppliedZxid();
byte[] takeSnapshot() throws Exception;
void restoreSnapshot(byte[] snapshot, long zxid) throws Exception;
}
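The recovery and election code below leans on a ZxidUtils helper that the article never defines. Here is a minimal sketch, assuming the standard ZooKeeper zxid layout (epoch in the high 32 bits, counter in the low 32 bits); the method names simply mirror the call sites that appear later:
public class ZxidUtils {
// The election code also treats ZxidUtils as stateful, tracking the last zxid
// recorded in the local transaction log (an assumption; not shown in the article)
private volatile long lastZxid;
public long getLastZxid() {
return lastZxid;
}
public void updateLastZxid(long zxid) {
this.lastZxid = zxid;
}
// Extract the epoch from the high 32 bits of a zxid
public static int getEpochFromZxid(long zxid) {
return (int) (zxid >>> 32);
}
// Extract the counter from the low 32 bits of a zxid
public static long getCounterFromZxid(long zxid) {
return zxid & 0xFFFFFFFFL;
}
// Compare two zxids: the epoch dominates, the counter breaks ties.
// Returns a negative, zero, or positive value, like Long.compare.
public static long compareZxid(long z1, long z2) {
int epochCmp = Integer.compare(getEpochFromZxid(z1), getEpochFromZxid(z2));
if (epochCmp != 0) {
return epochCmp;
}
return Long.compare(getCounterFromZxid(z1), getCounterFromZxid(z2));
}
}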
ZAB Recovery Mode Implementation
public class ZABRecovery {
private final AtomicLong zxid = new AtomicLong(0);
private final AtomicInteger epoch = new AtomicInteger(0);
private volatile ServerState state = ServerState.LOOKING;
private final Logger logger = LoggerFactory.getLogger(ZABRecovery.class);
private final ConcurrentMap<String, ServerData> serverDataMap;
private final int quorumSize;
private final NetworkClient networkClient;
private final StateMachine stateMachine;
private final String serverId;
// Constructor
public ZABRecovery(String serverId, int quorumSize, NetworkClient networkClient,
StateMachine stateMachine) {
this.serverId = serverId;
this.quorumSize = quorumSize;
this.networkClient = networkClient;
this.stateMachine = stateMachine;
this.serverDataMap = new ConcurrentHashMap<>();
}
// Leader recovery flow
public boolean startRecovery() throws RecoveryException {
MDC.put("component", "zab-recovery");
MDC.put("serverId", serverId);
try {
// 1. Bump the election epoch
int newEpoch = epoch.incrementAndGet();
logger.info("Starting recovery with epoch: {}", newEpoch);
// 2. Discovery phase: collect state from all followers
Map<Long, Set<String>> commitMap = discoverFollowerStates();
// 3. Determine the truncation/commit point
long truncateZxid = determineMaxCommittedZxid(commitMap);
logger.info("Determined truncate zxid: {}", Long.toHexString(truncateZxid));
// 4. Resolve potential conflicts (e.g. after a split brain)
resolveConflictsAfterPartition(truncateZxid, commitMap);
// 5. Synchronization phase: replay committed history to followers
syncFollowers(truncateZxid);
// 6. Switch to broadcast mode
state = ServerState.LEADING;
logger.info("Recovery completed, switching to broadcast mode");
return true;
} catch (IOException e) {
logger.error("Recovery failed due to I/O error", e);
state = ServerState.LOOKING;
throw new RecoveryException("I/O error during recovery", e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Recovery interrupted", e);
state = ServerState.LOOKING;
throw new RecoveryException("Recovery process interrupted", e);
} catch (Exception e) {
logger.error("Unexpected error during recovery", e);
state = ServerState.LOOKING;
throw new RecoveryException("Unexpected error during recovery", e);
} finally {
MDC.remove("component");
MDC.remove("serverId");
}
}
// Discovery phase: collect the latest transaction info from every follower
private Map<Long, Set<String>> discoverFollowerStates() throws IOException, InterruptedException {
Map<Long, Set<String>> acceptedZxids = new ConcurrentHashMap<>();
CountDownLatch latch = new CountDownLatch(serverDataMap.size());
List<CompletableFuture<?>> futures = new ArrayList<>();
// Send a CEPOCH message to every follower
for (var entry : serverDataMap.entrySet()) {
final String targetServerId = entry.getKey();
final ServerData serverData = entry.getValue();
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
MDC.put("targetServerId", targetServerId);
try {
// Send the new epoch
EpochPacket epochPkt = new EpochPacket(epoch.get());
LastZxidResponse response = networkClient.sendEpochRequest(
targetServerId, epochPkt);
// Record this server's latest zxid
synchronized (acceptedZxids) {
acceptedZxids.computeIfAbsent(response.getLastZxid(), k -> new HashSet<>())
.add(targetServerId);
}
logger.info("Server {} last zxid: {}", targetServerId,
Long.toHexString(response.getLastZxid()));
} catch (IOException e) {
logger.error("Failed to discover state from server: {}", targetServerId, e);
} finally {
MDC.remove("targetServerId");
latch.countDown();
}
});
futures.add(future);
}
// Wait up to 10 seconds for responses; the latch counts every follower, so on timeout we proceed with whatever arrived
if (!latch.await(10, TimeUnit.SECONDS)) {
logger.warn("Discovery phase timed out, proceeding with available responses");
}
// Cancel any tasks that have not completed
for (CompletableFuture<?> future : futures) {
if (!future.isDone()) {
future.cancel(true);
}
}
return acceptedZxids;
}
// Determine the highest committed transaction ID to retain
private long determineMaxCommittedZxid(Map<Long, Set<String>> commitMap) {
// Find the largest zxid acknowledged by a majority
long maxZxid = 0;
int quorum = getQuorum();
for (var entry : commitMap.entrySet()) {
if (entry.getValue().size() >= quorum && entry.getKey() > maxZxid) {
maxZxid = entry.getKey();
}
}
return maxZxid;
}
// Resolve data conflicts that may exist after a network partition
private void resolveConflictsAfterPartition(long truncateZxid,
Map<Long, Set<String>> commitMap) {
logger.info("Checking for potential conflicts after network partition");
// 1. Identify potentially conflicting transactions: higher zxids not held by a majority
List<ConflictingTransaction> conflicts = new ArrayList<>();
for (var entry : commitMap.entrySet()) {
long txnZxid = entry.getKey();
Set<String> servers = entry.getValue();
// The zxid is beyond the agreed truncation point but lacks majority acknowledgement
if (txnZxid > truncateZxid && servers.size() < getQuorum()) {
// Extract the epochs of the conflicting and truncation-point transactions
int txnEpoch = ZxidUtils.getEpochFromZxid(txnZxid);
int truncateEpoch = ZxidUtils.getEpochFromZxid(truncateZxid);
conflicts.add(new ConflictingTransaction(txnZxid, truncateZxid,
txnEpoch, truncateEpoch,
servers));
}
}
// 2. Handle the conflicts
if (!conflicts.isEmpty()) {
logger.warn("Found {} potential conflicting transactions after partition",
conflicts.size());
for (ConflictingTransaction conflict : conflicts) {
if (conflict.isFromHigherEpoch()) {
logger.warn("Conflict: transaction with zxid {} from higher epoch {} " +
"found but not in majority. Will be discarded.",
Long.toHexString(conflict.getConflictZxid()),
conflict.getConflictEpoch());
} else {
logger.warn("Conflict: transaction with zxid {} from same epoch {} " +
"found but not in majority. Will be discarded.",
Long.toHexString(conflict.getConflictZxid()),
conflict.getConflictEpoch());
}
// Tell these servers to truncate the conflicting transactions
notifyServersToTruncate(conflict.getServers(), truncateZxid);
}
} else {
logger.info("No conflicting transactions found");
}
}
// Tell servers to truncate transactions beyond the safe point
private void notifyServersToTruncate(Set<String> servers, long truncateZxid) {
for (String serverId : servers) {
CompletableFuture.runAsync(() -> {
try {
TruncatePacket truncPkt = new TruncatePacket(truncateZxid);
boolean success = networkClient.sendTruncate(serverId, truncPkt);
if (success) {
logger.info("Successfully notified server {} to truncate to zxid {}",
serverId, Long.toHexString(truncateZxid));
} else {
logger.warn("Failed to notify server {} to truncate", serverId);
}
} catch (IOException e) {
logger.error("Error notifying server {} to truncate", serverId, e);
}
});
}
}
// Synchronization phase: replay committed history to followers
private void syncFollowers(long truncateZxid) throws IOException, InterruptedException {
// Load all transactions starting from truncateZxid
List<Transaction> txns = loadTransactionsFromLog(truncateZxid);
logger.info("Syncing {} transactions to followers", txns.size());
// Sync all followers in parallel
CountDownLatch syncLatch = new CountDownLatch(serverDataMap.size());
AtomicInteger successCount = new AtomicInteger(0);
List<CompletableFuture<?>> futures = new ArrayList<>();
for (var entry : serverDataMap.entrySet()) {
final String targetServerId = entry.getKey();
final ServerData serverData = entry.getValue();
CompletableFuture<Void> future = CompletableFuture.runAsync(() -> {
MDC.put("targetServerId", targetServerId);
try {
// Check whether this follower is so far behind that it needs a snapshot
long followerZxid = serverData.getLastZxid();
if (truncateZxid - followerZxid > SNAPSHOT_THRESHOLD) {
syncFollowerWithSnapshot(targetServerId, followerZxid);
} else {
// 1. Send TRUNC, telling the follower to truncate its log
TruncatePacket truncPkt = new TruncatePacket(truncateZxid);
if (networkClient.sendTruncate(targetServerId, truncPkt)) {
// 2. Send DIFF, replaying the missing transactions
if (networkClient.sendTransactions(targetServerId, txns)) {
// 3. Send NEWLEADER to confirm synchronization is complete
NewLeaderPacket newLeaderPkt = new NewLeaderPacket(epoch.get());
if (networkClient.sendNewLeader(targetServerId, newLeaderPkt)) {
// Sync succeeded
successCount.incrementAndGet();
logger.info("Successfully synced server: {}", targetServerId);
}
}
}
}
} catch (IOException e) {
logger.error("Failed to sync server {} with {} transactions, last zxid: {}",
targetServerId, txns.size(), Long.toHexString(truncateZxid), e);
} finally {
MDC.remove("targetServerId");
syncLatch.countDown();
}
});
futures.add(future);
}
// Wait for synchronization to finish or time out
if (!syncLatch.await(30, TimeUnit.SECONDS)) {
logger.warn("Sync phase timed out");
}
// Cancel any tasks that have not completed
for (CompletableFuture<?> future : futures) {
if (!future.isDone()) {
future.cancel(true);
}
}
// Verify that enough servers synced successfully
if (successCount.get() < quorumSize) {
throw new QuorumNotFoundException("Failed to sync with quorum of followers",
successCount.get(), quorumSize);
}
}
// Sync a follower that has fallen too far behind using a snapshot
private void syncFollowerWithSnapshot(String followerId, long followerZxid) throws IOException {
try {
logger.info("Follower {} is too far behind (zxid: {}), syncing with snapshot",
followerId, Long.toHexString(followerZxid));
// 1. Take a snapshot of the current state
byte[] snapshot = stateMachine.takeSnapshot();
// 2. Ship the snapshot to the follower
networkClient.sendSnapshot(followerId, snapshot, zxid.get());
logger.info("Successfully sent snapshot to follower: {}", followerId);
} catch (Exception e) {
logger.error("Failed to sync follower {} with snapshot", followerId, e);
throw new IOException("Snapshot sync failed", e);
}
}
// Load transactions from the transaction log
private List<Transaction> loadTransactionsFromLog(long fromZxid) throws IOException {
List<Transaction> result = new ArrayList<>();
// A real implementation would read transaction records from persistent storage
logger.info("Loading transactions starting from zxid: {}", Long.toHexString(fromZxid));
return result;
}
private int getQuorum() {
// quorumSize is already the majority threshold (syncFollowers compares success
// counts against it directly), so return it as-is rather than halving it again
return quorumSize;
}
// Constants
private static final long SNAPSHOT_THRESHOLD = 100000; // fall back to a snapshot when a follower is more than 100,000 transactions behind
// Conflicting-transaction record
static class ConflictingTransaction {
private final long conflictZxid;
private final long truncateZxid;
private final int conflictEpoch;
private final int truncateEpoch;
private final Set<String> servers;
public ConflictingTransaction(long conflictZxid, long truncateZxid,
int conflictEpoch, int truncateEpoch,
Set<String> servers) {
this.conflictZxid = conflictZxid;
this.truncateZxid = truncateZxid;
this.conflictEpoch = conflictEpoch;
this.truncateEpoch = truncateEpoch;
this.servers = new HashSet<>(servers);
}
public boolean isFromHigherEpoch() {
return conflictEpoch > truncateEpoch;
}
public long getConflictZxid() {
return conflictZxid;
}
public int getConflictEpoch() {
return conflictEpoch;
}
public Set<String> getServers() {
return Collections.unmodifiableSet(servers);
}
}
// Other inner class definitions omitted...
enum ServerState {
LOOKING, // searching for a Leader
FOLLOWING, // follower role
LEADING // leader role
}
}
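startRecovery and syncFollowers above throw RecoveryException and QuorumNotFoundException, neither of which the article defines. Below is a minimal sketch consistent with the call sites; making QuorumNotFoundException an IOException subclass is our assumption, chosen so that syncFollowers' declared throws clause still compiles:
import java.io.IOException;

class RecoveryException extends Exception {
public RecoveryException(String message, Throwable cause) {
super(message, cause);
}
}

// Subclassing IOException is an assumption: syncFollowers declares only
// IOException and InterruptedException, yet throws QuorumNotFoundException
class QuorumNotFoundException extends IOException {
private final int achieved;
private final int required;
public QuorumNotFoundException(String message, int achieved, int required) {
super(message + " (achieved=" + achieved + ", required=" + required + ")");
this.achieved = achieved;
this.required = required;
}
public int getAchieved() {
return achieved;
}
public int getRequired() {
return required;
}
}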
ZAB Broadcast Mode Implementation
public class ZABBroadcast implements AutoCloseable {
private final AtomicLong zxid;
private final AtomicInteger epoch;
private final ConcurrentMap<String, ServerData> followers;
private final Logger logger = LoggerFactory.getLogger(ZABBroadcast.class);
private final CircuitBreaker circuitBreaker;
private final NetworkClient networkClient;
private final StateMachine stateMachine;
private final String serverId;
private final ReadWriteLock rwLock = new ReentrantReadWriteLock();
private final ScheduledExecutorService scheduler;
private final MetricsCollector metrics;
private final RateLimiter heartbeatLogLimiter = RateLimiter.create(0.1); // at most one log entry every 10 seconds
public ZABBroadcast(String serverId, AtomicLong zxid, AtomicInteger epoch,
NetworkClient networkClient, StateMachine stateMachine) {
this.serverId = serverId;
this.zxid = zxid;
this.epoch = epoch;
this.networkClient = networkClient;
this.stateMachine = stateMachine;
this.followers = new ConcurrentHashMap<>();
this.circuitBreaker = new CircuitBreaker(5, 10000); // open after 5 failures, reset after 10 seconds
this.scheduler = Executors.newScheduledThreadPool(2, r -> {
Thread t = new Thread(r, "zab-scheduler-" + serverId);
t.setDaemon(true);
return t;
});
this.metrics = new MetricsCollector("zab_broadcast");
// Start the heartbeat task
scheduler.scheduleWithFixedDelay(this::sendHeartbeats,
500, 500, TimeUnit.MILLISECONDS);
}
// Register a follower
public void addFollower(ServerData follower) {
followers.put(follower.getId(), follower);
}
// Leader-side write request handling
public CompletableFuture<Boolean> processWrite(Request request) {
Stopwatch stopwatch = Stopwatch.createStarted();
MDC.put("component", "zab-broadcast");
MDC.put("serverId", serverId);
MDC.put("requestId", request.getId());
try {
return GlobalExceptionHandler.withExceptionHandling(
circuitBreaker.execute(() -> {
try {
// 1. Generate a zxid for the request (high 32 bits: epoch, low 32 bits: counter)
long newZxid = createNewZxid();
MDC.put("zxid", Long.toHexString(newZxid));
logger.info("Processing write request: {} with zxid: {}",
request.getId(), Long.toHexString(newZxid));
// 2. Send the proposal to all followers
List<Future<ACK>> futures = sendToFollowers(request, newZxid);
// 3. Wait for ACKs from a majority of followers
if (waitForMajority(futures)) {
// 4. Tell all followers to commit the transaction
commit(newZxid);
logger.info("Request {} committed successfully", request.getId());
// 5. Record metrics
metrics.recordSuccessfulWrite(stopwatch.elapsed(TimeUnit.MILLISECONDS));
return CompletableFuture.completedFuture(true);
} else {
logger.warn("Failed to get majority ACKs for request {}", request.getId());
metrics.recordFailedWrite();
return CompletableFuture.completedFuture(false);
}
} catch (IOException e) {
logger.error("Failed to process write request: {}", request.getId(), e);
metrics.recordFailedWrite();
return CompletableFuture.failedFuture(
new ProcessingException("Failed to process write request", e));
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted while processing write request: {}", request.getId(), e);
metrics.recordFailedWrite();
return CompletableFuture.failedFuture(
new ProcessingException("Interrupted during write processing", e));
}
})
);
} catch (CircuitBreakerOpenException e) {
logger.error("Circuit breaker is open, rejecting request: {}", request.getId());
metrics.recordRejectedWrite();
return CompletableFuture.failedFuture(
new ProcessingException("Circuit breaker open, system overloaded", e));
} finally {
MDC.remove("component");
MDC.remove("serverId");
MDC.remove("requestId");
MDC.remove("zxid");
}
}
// Batch write handling to improve throughput
public CompletableFuture<Map<String, Boolean>> processBatchWrite(List<Request> requests) {
if (requests.isEmpty()) {
return CompletableFuture.completedFuture(Collections.emptyMap());
}
Stopwatch stopwatch = Stopwatch.createStarted();
MDC.put("component", "zab-broadcast");
MDC.put("serverId", serverId);
MDC.put("batchSize", String.valueOf(requests.size()));
try {
return GlobalExceptionHandler.withExceptionHandling(
circuitBreaker.execute(() -> {
Map<String, Boolean> results = new HashMap<>();
try {
// Build the batch
BatchRequest batch = new BatchRequest();
for (Request req : requests) {
batch.addRequest(req);
results.put(req.getId(), false); // default to failure
}
// Generate a single zxid for the whole batch
long batchZxid = createNewZxid();
MDC.put("zxid", Long.toHexString(batchZxid));
logger.info("Processing batch of {} requests with zxid: {}",
requests.size(), Long.toHexString(batchZxid));
// Send the batch proposal to all followers
List<Future<ACK>> futures = sendBatchToFollowers(batch, batchZxid);
// Wait for majority acknowledgement
if (waitForMajority(futures)) {
// Commit the batch
commitBatch(batchZxid);
logger.info("Batch with {} requests committed successfully", requests.size());
// Mark every request in the batch as successful
for (Request req : requests) {
results.put(req.getId(), true);
}
metrics.recordSuccessfulBatchWrite(
requests.size(), stopwatch.elapsed(TimeUnit.MILLISECONDS));
} else {
logger.warn("Failed to get majority ACKs for batch");
metrics.recordFailedBatchWrite(requests.size());
}
} catch (Exception e) {
logger.error("Error processing batch write of {} requests", requests.size(), e);
metrics.recordFailedBatchWrite(requests.size());
}
return CompletableFuture.completedFuture(results);
})
);
} catch (CircuitBreakerOpenException e) {
logger.error("Circuit breaker is open, rejecting batch of {} requests", requests.size());
metrics.recordRejectedBatchWrite(requests.size());
Map<String, Boolean> results = new HashMap<>();
for (Request req : requests) {
results.put(req.getId(), false);
}
return CompletableFuture.failedFuture(
new ProcessingException("Circuit breaker open, system overloaded", e));
} finally {
MDC.remove("component");
MDC.remove("serverId");
MDC.remove("batchSize");
MDC.remove("zxid");
}
}
// Consistency guarantees for read operations
public CompletableFuture<Result> readWithConsistency(String key, ConsistencyLevel level) {
Stopwatch stopwatch = Stopwatch.createStarted();
MDC.put("component", "zab-broadcast");
MDC.put("serverId", serverId);
MDC.put("key", key);
MDC.put("consistency", level.name());
try {
ReadStrategy strategy = readStrategies.getOrDefault(
level, readStrategies.get(ConsistencyLevel.EVENTUAL));
CompletableFuture<Result> result = strategy.execute(key, this::readLocal);
result.thenAccept(r ->
metrics.recordRead(level, stopwatch.elapsed(TimeUnit.MILLISECONDS)));
return result;
} catch (Exception e) {
logger.error("Error performing {} read for key: {}", level, key, e);
metrics.recordFailedRead(level);
return CompletableFuture.failedFuture(
new ProcessingException("Read operation failed", e));
} finally {
MDC.remove("component");
MDC.remove("serverId");
MDC.remove("key");
MDC.remove("consistency");
}
}
// Read from local state
private Result readLocal(String key) {
rwLock.readLock().lock();
try {
// A real implementation would read from the local store
return new Result(key, "value", true);
} finally {
rwLock.readLock().unlock();
}
}
// Generate a new zxid, handling counter overflow
private long createNewZxid() {
rwLock.writeLock().lock();
try {
long currentCounter = zxid.get() & 0xFFFFFFFFL;
// Detect and handle overflow
if (currentCounter >= 0xFFFFFFFFL) {
// Counter about to overflow: bump the epoch
int newEpoch = epoch.incrementAndGet();
logger.warn("ZXID counter overflow, incrementing epoch to {}", newEpoch);
long newZxid = ((long)newEpoch << 32); // reset the counter
zxid.set(newZxid);
return newZxid;
}
return zxid.incrementAndGet();
} finally {
rwLock.writeLock().unlock();
}
}
// Send a proposal to all followers
private List<Future<ACK>> sendToFollowers(Request request, long newZxid)
throws IOException {
List<Future<ACK>> futures = new ArrayList<>();
ProposalPacket proposal = new ProposalPacket(newZxid, request);
ExecutorService executor = Executors.newFixedThreadPool(followers.size(),
r -> {
Thread t = new Thread(r, "proposal-sender-" + serverId);
t.setDaemon(true);
return t;
});
try {
for (var entry : followers.entrySet()) {
final String targetServerId = entry.getKey();
futures.add(executor.submit(() -> {
MDC.put("targetServerId", targetServerId);
try {
ACK ack = networkClient.sendProposal(targetServerId, proposal);
logger.debug("Received ACK from {} for zxid {}",
targetServerId, Long.toHexString(newZxid));
return ack;
} catch (IOException e) {
logger.error("Failed to send proposal to follower {}, zxid: {}",
targetServerId, Long.toHexString(newZxid), e);
return null;
} finally {
MDC.remove("targetServerId");
}
}));
}
} finally {
// Only stop accepting new tasks here; forcing shutdownNow() after a short wait
// would interrupt in-flight proposals before waitForMajority() can collect the
// ACKs. The daemon worker threads exit once the submitted sends finish.
executor.shutdown();
}
return futures;
}
// Wait for a majority of responses
private boolean waitForMajority(List<Future<ACK>> futures)
throws InterruptedException {
int ackCount = 0;
int majority = (followers.size() / 2) + 1; // majority of the followers only; the leader's implicit ACK is not counted, which is stricter than necessary
for (Future<ACK> future : futures) {
try {
ACK ack = future.get(5, TimeUnit.SECONDS);
if (ack != null && ack.isSuccess()) {
ackCount++;
if (ackCount >= majority) {
// Majority reached; return early
return true;
}
}
} catch (ExecutionException e) {
logger.warn("Error getting ACK", e.getCause());
} catch (TimeoutException e) {
logger.warn("Timeout waiting for ACK");
}
}
return ackCount >= majority;
}
// Tell all followers to commit
private void commit(long zxid) throws IOException {
CommitPacket commit = new CommitPacket(zxid);
for (var entry : followers.entrySet()) {
final String targetServerId = entry.getKey();
CompletableFuture.runAsync(() -> {
MDC.put("targetServerId", targetServerId);
try {
networkClient.sendCommit(targetServerId, commit);
logger.debug("Sent commit to {} for zxid {}",
targetServerId, Long.toHexString(zxid));
} catch (IOException e) {
logger.error("Failed to send commit to follower {}, zxid: {}",
targetServerId, Long.toHexString(zxid), e);
} finally {
MDC.remove("targetServerId");
}
});
}
}
// Send a batch proposal
private List<Future<ACK>> sendBatchToFollowers(BatchRequest batch, long batchZxid)
throws IOException {
ProposalPacket proposal = new ProposalPacket(batchZxid, batch);
return sendProposalToFollowers(proposal, batchZxid);
}
// Commit a batch
private void commitBatch(long batchZxid) throws IOException {
commit(batchZxid);
}
// Send heartbeats to all followers
private void sendHeartbeats() {
long currentZxid = zxid.get();
for (var entry : followers.entrySet()) {
final String targetServerId = entry.getKey();
CompletableFuture.runAsync(() -> {
try {
networkClient.sendHeartbeat(targetServerId, currentZxid);
} catch (IOException e) {
// Heartbeat failed; rate-limit logging to avoid flooding
if (heartbeatLogLimiter.tryAcquire()) {
logger.debug("Failed to send heartbeat to {}", targetServerId, e);
}
}
});
}
}
// Send a proposal to all followers (generic helper)
private List<Future<ACK>> sendProposalToFollowers(ProposalPacket proposal, long zxid)
throws IOException {
List<Future<ACK>> futures = new ArrayList<>();
ExecutorService executor = Executors.newFixedThreadPool(followers.size(),
r -> {
Thread t = new Thread(r, "proposal-sender-" + serverId);
t.setDaemon(true);
return t;
});
try {
for (var entry : followers.entrySet()) {
final String targetServerId = entry.getKey();
futures.add(executor.submit(() -> {
MDC.put("targetServerId", targetServerId);
try {
ACK ack = networkClient.sendProposal(targetServerId, proposal);
logger.debug("Received ACK from {} for zxid {}",
targetServerId, Long.toHexString(zxid));
return ack;
} catch (IOException e) {
logger.error("Failed to send proposal to follower {}, zxid: {}",
targetServerId, Long.toHexString(zxid), e);
return null;
} finally {
MDC.remove("targetServerId");
}
}));
}
} finally {
// As above, only stop accepting new tasks; do not interrupt in-flight sends
// that callers are still waiting on via the returned futures
executor.shutdown();
}
return futures;
}
// Read-strategy interface and implementations
private interface ReadStrategy {
CompletableFuture<Result> execute(String key, Supplier<Result> readFromLocal);
}
private final Map<ConsistencyLevel, ReadStrategy> readStrategies = new EnumMap<>(ConsistencyLevel.class);
{
// Initialize the read strategies
readStrategies.put(ConsistencyLevel.LINEARIZABLE, new LinearizableReadStrategy());
readStrategies.put(ConsistencyLevel.SEQUENTIAL, new SequentialReadStrategy());
readStrategies.put(ConsistencyLevel.READ_YOUR_WRITES, new ReadYourWritesStrategy());
readStrategies.put(ConsistencyLevel.BOUNDED_STALENESS, new BoundedStalenessStrategy());
readStrategies.put(ConsistencyLevel.EVENTUAL, new EventualReadStrategy());
}
// Linearizable read strategy
private class LinearizableReadStrategy implements ReadStrategy {
private final AtomicLong leaseExpirationTime = new AtomicLong(0);
private final long leaderLeaseMs = 5000; // 5-second lease
@Override
public CompletableFuture<Result> execute(String key, Supplier<Result> readFromLocal) {
// The Leader must confirm it is still the Leader (lease mechanism)
if (System.currentTimeMillis() < leaseExpirationTime.get()) {
// Lease still valid; safe to read locally
return CompletableFuture.completedFuture(readFromLocal.get());
} else {
// Lease expired; must reconfirm leadership with a majority
return renewLease().thenApply(renewed -> {
if (renewed) {
return readFromLocal.get();
} else {
throw new ConsistencyException("Cannot guarantee linearizable read");
}
});
}
}
private CompletableFuture<Boolean> renewLease() {
// A real implementation must obtain majority acknowledgement here
leaseExpirationTime.set(System.currentTimeMillis() + leaderLeaseMs);
logger.info("Renewed leader lease until {}", leaseExpirationTime.get());
return CompletableFuture.completedFuture(true);
}
}
// Sequential-consistency read strategy
private class SequentialReadStrategy implements ReadStrategy {
@Override
public CompletableFuture<Result> execute(String key, Supplier<Result> readFromLocal) {
// Make sure all committed transactions have been applied
return ensureAppliedUpToDate()
.thenApply(v -> readFromLocal.get());
}
private CompletableFuture<Void> ensureAppliedUpToDate() {
// A real implementation would wait until the local state machine catches up
logger.debug("Ensuring all committed transactions are applied");
return CompletableFuture.completedFuture(null);
}
}
// Read-your-writes strategy
private class ReadYourWritesStrategy implements ReadStrategy {
private final ConcurrentMap<String, Long> writeTimestamps = new ConcurrentHashMap<>();
@Override
public CompletableFuture<Result> execute(String key, Supplier<Result> readFromLocal) {
// Check whether this key was written recently
Long writeTime = writeTimestamps.get(key);
if (writeTime != null) {
// Give the write enough time to complete
long elapsed = System.currentTimeMillis() - writeTime;
if (elapsed < 100) { // assume 100 ms is enough for the write to land
try {
Thread.sleep(100 - elapsed);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
}
}
return CompletableFuture.completedFuture(readFromLocal.get());
}
// Record a write
public void recordWrite(String key) {
writeTimestamps.put(key, System.currentTimeMillis());
}
}
// Bounded-staleness strategy
private class BoundedStalenessStrategy implements ReadStrategy {
private final ConcurrentMap<String, CacheEntry> cache = new ConcurrentHashMap<>();
private final long maxStalenessMs = 1000; // at most 1 second of staleness
@Override
public CompletableFuture<Result> execute(String key, Supplier<Result> readFromLocal) {
// Check the cache
CacheEntry entry = cache.get(key);
if (entry != null) {
long age = System.currentTimeMillis() - entry.getTimestamp();
if (age <= maxStalenessMs) {
// Cache entry still fresh; return it directly
return CompletableFuture.completedFuture(entry.getResult());
}
}
// Cache miss or stale entry; read locally and refresh the cache
Result result = readFromLocal.get();
cache.put(key, new CacheEntry(result, System.currentTimeMillis()));
return CompletableFuture.completedFuture(result);
}
// Periodically evict stale cache entries
public void cleanup() {
long now = System.currentTimeMillis();
cache.entrySet().removeIf(entry ->
now - entry.getValue().getTimestamp() > maxStalenessMs);
}
}
// Eventual-consistency strategy
private class EventualReadStrategy implements ReadStrategy {
@Override
public CompletableFuture<Result> execute(String key, Supplier<Result> readFromLocal) {
// Read directly from local state; the latest writes may not be visible
return CompletableFuture.completedFuture(readFromLocal.get());
}
}
// Cache entry
private static class CacheEntry {
private final Result result;
private final long timestamp;
public CacheEntry(Result result, long timestamp) {
this.result = result;
this.timestamp = timestamp;
}
public Result getResult() {
return result;
}
public long getTimestamp() {
return timestamp;
}
}
@Override
public void close() {
try {
List<Runnable> pendingTasks = scheduler.shutdownNow();
if (!pendingTasks.isEmpty()) {
logger.warn("Scheduler shutdown with {} pending tasks", pendingTasks.size());
}
if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) {
logger.warn("Scheduler did not terminate in time");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted while waiting for scheduler termination");
}
}
// Circuit breaker implementation (a safer variant)
static class CircuitBreaker {
private final AtomicReference<State> state = new AtomicReference<>(State.CLOSED);
private final AtomicLong failureCount = new AtomicLong(0);
private final AtomicLong lastFailureTime = new AtomicLong(0);
private final int threshold;
private final long resetTimeoutMs;
private final StampedLock stateLock = new StampedLock();
private final Logger logger = LoggerFactory.getLogger(CircuitBreaker.class);
public enum State { CLOSED, OPEN, HALF_OPEN }
public CircuitBreaker(int threshold, long resetTimeoutMs) {
this.threshold = threshold;
this.resetTimeoutMs = resetTimeoutMs;
}
public <T> CompletableFuture<T> execute(Supplier<CompletableFuture<T>> action)
throws CircuitBreakerOpenException {
State currentState = getCurrentState();
if (currentState == State.OPEN) {
// Check whether we should probe with a half-open state
if (System.currentTimeMillis() - lastFailureTime.get() > resetTimeoutMs) {
boolean transitioned = tryTransitionState(State.OPEN, State.HALF_OPEN);
if (!transitioned) {
throw new CircuitBreakerOpenException("Circuit breaker is open");
}
currentState = State.HALF_OPEN;
} else {
throw new CircuitBreakerOpenException("Circuit breaker is open");
}
}
final State executionState = currentState;
try {
CompletableFuture<T> future = action.get();
return future.handle((result, ex) -> {
if (ex != null) {
recordFailure();
throw new CompletionException(ex);
} else {
// Success: reset the failure count
if (executionState == State.HALF_OPEN) {
tryTransitionState(State.HALF_OPEN, State.CLOSED);
}
failureCount.set(0);
return result;
}
});
} catch (Exception e) {
recordFailure();
throw e;
}
}
private void recordFailure() {
long stamp = stateLock.writeLock();
try {
long failures = failureCount.incrementAndGet();
lastFailureTime.set(System.currentTimeMillis());
if (failures >= threshold && state.get() == State.CLOSED) {
logger.warn("Circuit breaker opening after {} failures", failures);
state.set(State.OPEN);
}
} finally {
stateLock.unlockWrite(stamp);
}
}
private boolean tryTransitionState(State fromState, State toState) {
long stamp = stateLock.writeLock();
try {
if (state.get() == fromState) {
state.set(toState);
logger.info("Circuit breaker state changed from {} to {}", fromState, toState);
return true;
}
return false;
} finally {
stateLock.unlockWrite(stamp);
}
}
// Read the current state with an optimistic read lock
public State getCurrentState() {
long stamp = stateLock.tryOptimisticRead();
State result = state.get();
if (!stateLock.validate(stamp)) {
stamp = stateLock.readLock();
try {
result = state.get();
} finally {
stateLock.unlockRead(stamp);
}
}
return result;
}
}
// Global exception handler
static class GlobalExceptionHandler {
private static final Logger logger = LoggerFactory.getLogger(GlobalExceptionHandler.class);
public static <T> CompletableFuture<T> withExceptionHandling(CompletableFuture<T> future) {
return future.exceptionally(e -> {
Throwable cause = e instanceof CompletionException ? e.getCause() : e;
if (cause instanceof ConsistencyException) {
logger.error("Consistency error: {}", cause.getMessage());
} else if (cause instanceof IOException) {
logger.error("I/O error: {}", cause.getMessage());
} else if (cause instanceof InterruptedException) {
Thread.currentThread().interrupt();
logger.warn("Operation interrupted");
} else {
logger.error("Unexpected error: {}", cause.getClass().getName(), cause);
}
throw new CompletionException(cause);
});
}
}
// Metrics collector
private static class MetricsCollector {
private final Counter writeRequests;
private final Counter writeSuccess;
private final Counter writeFailed;
private final Counter writeRejected;
private final Counter batchWrites;
private final Counter batchWriteRequests;
private final Counter readRequests;
private final Map<ConsistencyLevel, Counter> readsByLevel = new EnumMap<>(ConsistencyLevel.class);
private final Histogram writeLatency;
private final Histogram batchWriteLatency;
private final Map<ConsistencyLevel, Histogram> readLatency = new EnumMap<>(ConsistencyLevel.class);
public MetricsCollector(String prefix) {
this.writeRequests = Counter.build()
.name(prefix + "_write_requests_total")
.help("Total number of write requests").register();
this.writeSuccess = Counter.build()
.name(prefix + "_write_success_total")
.help("Total number of successful writes").register();
this.writeFailed = Counter.build()
.name(prefix + "_write_failed_total")
.help("Total number of failed writes").register();
this.writeRejected = Counter.build()
.name(prefix + "_write_rejected_total")
.help("Total number of rejected writes").register();
this.batchWrites = Counter.build()
.name(prefix + "_batch_writes_total")
.help("Total number of batch writes").register();
this.batchWriteRequests = Counter.build()
.name(prefix + "_batch_write_requests_total")
.help("Total number of requests in batch writes").register();
this.readRequests = Counter.build()
.name(prefix + "_read_requests_total")
.help("Total number of read requests").register();
this.writeLatency = Histogram.build()
.name(prefix + "_write_latency_ms")
.help("Write latency in milliseconds").register();
this.batchWriteLatency = Histogram.build()
.name(prefix + "_batch_write_latency_ms")
.help("Batch write latency in milliseconds").register();
// Initialize per-consistency-level counters and histograms
for (ConsistencyLevel level : ConsistencyLevel.values()) {
readsByLevel.put(level, Counter.build()
.name(prefix + "_reads_" + level.name().toLowerCase() + "_total")
.help("Total " + level + " reads").register());
readLatency.put(level, Histogram.build()
.name(prefix + "_read_" + level.name().toLowerCase() + "_latency_ms")
.help(level + " read latency in milliseconds").register());
}
}
public void recordSuccessfulWrite(long latencyMs) {
writeRequests.inc();
writeSuccess.inc();
writeLatency.observe(latencyMs);
}
public void recordFailedWrite() {
writeRequests.inc();
writeFailed.inc();
}
public void recordRejectedWrite() {
writeRequests.inc();
writeRejected.inc();
}
public void recordSuccessfulBatchWrite(int batchSize, long latencyMs) {
batchWrites.inc();
batchWriteRequests.inc(batchSize);
writeRequests.inc(batchSize);
writeSuccess.inc(batchSize);
batchWriteLatency.observe(latencyMs);
}
public void recordFailedBatchWrite(int batchSize) {
batchWrites.inc();
batchWriteRequests.inc(batchSize);
writeRequests.inc(batchSize);
writeFailed.inc(batchSize);
}
public void recordRejectedBatchWrite(int batchSize) {
batchWrites.inc();
batchWriteRequests.inc(batchSize);
writeRequests.inc(batchSize);
writeRejected.inc(batchSize);
}
public void recordRead(ConsistencyLevel level, long latencyMs) {
readRequests.inc();
readsByLevel.get(level).inc();
readLatency.get(level).observe(latencyMs);
}
public void recordFailedRead(ConsistencyLevel level) {
readRequests.inc();
// a dedicated failure counter could be added here
}
}
// Exception types
public static class CircuitBreakerOpenException extends Exception {
public CircuitBreakerOpenException(String message) {
super(message);
}
}
public static class ConsistencyException extends RuntimeException {
public ConsistencyException(String message) {
super(message);
}
}
public static class ProcessingException extends RuntimeException {
public ProcessingException(String message, Throwable cause) {
super(message, cause);
}
}
// Other inner classes and constants omitted...
enum ConsistencyLevel {
LINEARIZABLE, // linearizable (strongest)
SEQUENTIAL, // sequential consistency
READ_YOUR_WRITES, // read-your-writes
BOUNDED_STALENESS, // bounded staleness
EVENTUAL // eventual consistency (weakest)
}
}
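To tie the two classes together, a hypothetical leader bootstrap might look like the following. This is a sketch under stated assumptions: the Request constructor shown here is invented for illustration (the article never shows one), and in a real system the zxid/epoch counters would be seeded from the recovery phase rather than created fresh:
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

public class LeaderBootstrap {
public static void runLeader(String serverId, int quorumSize,
NetworkClient client, StateMachine sm,
List<ServerData> followerList) throws Exception {
// 1. Run recovery first; only a recovered leader may broadcast
ZABRecovery recovery = new ZABRecovery(serverId, quorumSize, client, sm);
if (!recovery.startRecovery()) {
throw new IllegalStateException("Recovery did not complete");
}
// 2. Switch to broadcast mode (counters created fresh here for brevity;
// a real system would carry them over from recovery)
AtomicLong zxid = new AtomicLong(0);
AtomicInteger epoch = new AtomicInteger(0);
try (ZABBroadcast broadcast = new ZABBroadcast(serverId, zxid, epoch, client, sm)) {
for (ServerData follower : followerList) {
broadcast.addFollower(follower);
}
// 3. A write followed by a linearizable read
// (this Request constructor is hypothetical)
Request request = new Request("req-1", "put:key=value".getBytes());
boolean committed = broadcast.processWrite(request).get(10, TimeUnit.SECONDS);
System.out.println("write committed: " + committed);
Result result = broadcast.readWithConsistency("key",
ZABBroadcast.ConsistencyLevel.LINEARIZABLE).get(5, TimeUnit.SECONDS);
System.out.println("read result: " + result);
}
}
}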
The Fast Leader Election Algorithm
public class FastLeaderElection {
private final AtomicLong logicalClock = new AtomicLong(0);
private final ConcurrentMap<String, Vote> receivedVotes = new ConcurrentHashMap<>();
private final String serverId;
private final NetworkManager networkManager;
private final int quorumSize;
private final AtomicInteger electionAttempts = new AtomicInteger(0);
private final Logger logger = LoggerFactory.getLogger(FastLeaderElection.class);
private final ZxidUtils zxidUtils;
public FastLeaderElection(String serverId, int quorumSize,
NetworkManager networkManager, ZxidUtils zxidUtils) {
this.serverId = serverId;
this.quorumSize = quorumSize;
this.networkManager = networkManager;
this.zxidUtils = zxidUtils;
}
public String lookForLeader() throws InterruptedException {
MDC.put("component", "fast-leader-election");
MDC.put("serverId", serverId);
try {
// Bump the logical clock
long newLogicalClock = logicalClock.incrementAndGet();
logger.info("Starting leader election with logical clock: {}", newLogicalClock);
// Initialize the ballot by voting for ourselves
Vote vote = new Vote(serverId, zxidUtils.getLastZxid(), newLogicalClock);
receivedVotes.clear();
receivedVotes.put(serverId, vote);
// Broadcast our vote to all other servers
networkManager.broadcastVote(vote);
// Election timeout bookkeeping
long startTime = System.currentTimeMillis();
long maxTimeout = 60000; // 60-second overall cap
// Election loop
Map<String, Integer> voteCounter = new HashMap<>();
String currentLeader = null;
while (System.currentTimeMillis() - startTime < maxTimeout) {
// Receive a vote
Vote receivedVote = networkManager.receiveVote(200); // 200 ms poll timeout
if (receivedVote != null) {
MDC.put("candidateId", receivedVote.getServerId());
logger.debug("Received vote from {}: zxid={}, logicalClock={}",
receivedVote.getServerId(),
Long.toHexString(receivedVote.getZxid()),
receivedVote.getLogicalClock());
// Validate the logical clock
if (receivedVote.getLogicalClock() > newLogicalClock) {
// A higher logical clock was seen: adopt it and restart the election
logicalClock.set(receivedVote.getLogicalClock());
logger.info("Found higher logical clock: {}, restarting election",
receivedVote.getLogicalClock());
MDC.remove("candidateId");
electionAttempts.set(0); // reset the attempt counter
return lookForLeader(); // restart the election
} else if (receivedVote.getLogicalClock() < newLogicalClock) {
// Ignore votes from an older logical clock
logger.debug("Ignoring vote with older logical clock: {}",
receivedVote.getLogicalClock());
MDC.remove("candidateId");
continue;
}
// Compare ballots
int comparison = compareVotes(vote, receivedVote);
if (comparison < 0) {
// A better ballot arrived; update our own vote
vote = new Vote(receivedVote.getServerId(),
receivedVote.getZxid(),
newLogicalClock);
// Re-broadcast the updated vote
networkManager.broadcastVote(vote);
logger.info("Updated vote to server: {}", vote.getServerId());
}
// Record the received vote
receivedVotes.put(receivedVote.getServerId(), receivedVote);
MDC.remove("candidateId");
// Tally the votes
voteCounter.clear();
for (Vote v : receivedVotes.values()) {
String candidate = v.getServerId();
voteCounter.put(candidate, voteCounter.getOrDefault(candidate, 0) + 1);
// Check whether any candidate has majority support
if (voteCounter.get(candidate) >= quorumSize) {
currentLeader = candidate;
logger.info("Elected leader: {} with {} votes of {} required",
candidate, voteCounter.get(candidate), quorumSize);
break;
}
}
if (currentLeader != null) {
break; // a leader has been elected
}
}
}
if (currentLeader == null) {
// Election failed; back off exponentially to avoid livelock
handleElectionFailure();
logger.warn("Failed to elect a leader, retrying...");
return lookForLeader(); // retry
}
electionAttempts.set(0); // reset the attempt counter
return currentLeader;
} catch (Exception e) {
logger.error("Error during leader election", e);
// Bump the attempt counter and back off
handleElectionFailure();
throw new LeaderElectionException("Leader election failed", e);
} finally {
MDC.remove("component");
MDC.remove("serverId");
}
}
// Handle election failure with exponential backoff to avoid livelock
private void handleElectionFailure() {
int attempts = electionAttempts.incrementAndGet();
// Exponential backoff
int backoffMs = Math.min(1000 * (1 << Math.min(attempts, 10)), 30000);
// Add random jitter so servers do not retry in lockstep
backoffMs += ThreadLocalRandom.current().nextInt(backoffMs / 2);
logger.info("Election attempt {} failed, backing off for {}ms", attempts, backoffMs);
try {
Thread.sleep(backoffMs);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted during election backoff");
}
}
// Compare two votes: negative means v2 is better, zero means equal, positive means v1 is better
private int compareVotes(Vote v1, Vote v2) {
// Compare zxids first; a larger zxid has higher priority
long zxidDiff = ZxidUtils.compareZxid(v1.getZxid(), v2.getZxid());
if (zxidDiff != 0) {
return (int) Math.signum(zxidDiff);
}
// zxids equal; fall back to comparing serverIds
return v1.getServerId().compareTo(v2.getServerId());
}
// Inner classes and utility methods...
static class Vote {
private final String serverId;
private final long zxid;
private final long logicalClock;
public Vote(String serverId, long zxid, long logicalClock) {
this.serverId = serverId;
this.zxid = zxid;
this.logicalClock = logicalClock;
}
public String getServerId() {
return serverId;
}
public long getZxid() {
return zxid;
}
public long getLogicalClock() {
return logicalClock;
}
@Override
public String toString() {
return "Vote{serverId='" + serverId + "', zxid=" + Long.toHexString(zxid) +
", logicalClock=" + logicalClock + '}';
}
}
// Custom exception class
public static class LeaderElectionException extends RuntimeException {
public LeaderElectionException(String message, Throwable cause) {
super(message, cause);
}
}
}
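The election code depends on a NetworkManager that the article never shows. Below is a minimal in-memory sketch, consistent with the two calls the election makes (broadcastVote, and a receiveVote that blocks up to a millisecond timeout and returns null on timeout); it is intended only for wiring up local tests, not as the article's real transport:
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

interface NetworkManager {
void broadcastVote(FastLeaderElection.Vote vote);
FastLeaderElection.Vote receiveVote(long timeoutMs) throws InterruptedException;
}

class InMemoryNetworkManager implements NetworkManager {
private final BlockingQueue<FastLeaderElection.Vote> inbox = new LinkedBlockingQueue<>();
private final List<InMemoryNetworkManager> peers = new CopyOnWriteArrayList<>();
// Wire up the other nodes' managers before starting the election
public void addPeer(InMemoryNetworkManager peer) {
peers.add(peer);
}
@Override
public void broadcastVote(FastLeaderElection.Vote vote) {
for (InMemoryNetworkManager peer : peers) {
peer.inbox.offer(vote);
}
}
@Override
public FastLeaderElection.Vote receiveVote(long timeoutMs) throws InterruptedException {
// Returns null on timeout, matching how the election loop treats it
return inbox.poll(timeoutMs, TimeUnit.MILLISECONDS);
}
}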
Network Client Implementation Example
public class NettyNetworkClient implements NetworkClient {
private final EventLoopGroup workerGroup;
private final Bootstrap bootstrap;
private final ConcurrentMap<String, Channel> channels = new ConcurrentHashMap<>();
private final int connectionTimeoutMs;
private final Logger logger = LoggerFactory.getLogger(NettyNetworkClient.class);
public NettyNetworkClient(int connectionTimeoutMs) {
this.connectionTimeoutMs = connectionTimeoutMs;
this.workerGroup = new NioEventLoopGroup();
this.bootstrap = new Bootstrap()
.group(workerGroup)
.channel(NioSocketChannel.class)
.option(ChannelOption.SO_KEEPALIVE, true)
.option(ChannelOption.TCP_NODELAY, true)
.option(ChannelOption.CONNECT_TIMEOUT_MILLIS, connectionTimeoutMs)
.handler(new ChannelInitializer<SocketChannel>() {
@Override
protected void initChannel(SocketChannel ch) {
ch.pipeline()
.addLast(new LengthFieldBasedFrameDecoder(1048576, 0, 4, 0, 4))
.addLast(new LengthFieldPrepender(4))
.addLast(new PacketEncoder())
.addLast(new PacketDecoder())
.addLast(new ClientHandler());
}
});
}
@Override
public void connect(String serverId, String address, int port) throws IOException {
try {
ChannelFuture future = bootstrap.connect(address, port);
boolean connected = future.await(connectionTimeoutMs, TimeUnit.MILLISECONDS);
if (!connected || !future.isSuccess()) {
throw new IOException("Failed to connect to " + serverId + " at " +
address + ":" + port);
}
channels.put(serverId, future.channel());
logger.info("Connected to server: {} at {}:{}", serverId, address, port);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Interrupted while connecting to " + serverId, e);
} catch (Exception e) {
throw new IOException("Failed to connect to " + serverId, e);
}
}
@Override
public void disconnect(String serverId) {
Channel channel = channels.remove(serverId);
if (channel != null) {
channel.close();
logger.info("Disconnected from server: {}", serverId);
}
}
@Override
public ACK sendProposal(String serverId, ProposalPacket proposal) throws IOException {
MDC.put("targetServerId", serverId);
try {
Channel channel = getChannel(serverId);
RequestFuture<ACK> future = new RequestFuture<>();
// Register the request/response mapping
Long requestId = generateRequestId();
RequestRegistry.register(requestId, future);
// Wrap the request
Request request = new Request(requestId, RequestType.PROPOSAL, proposal);
// Send the request
channel.writeAndFlush(request).sync();
// Wait for the response
ACK ack = future.get(5, TimeUnit.SECONDS);
if (ack == null) {
throw new IOException("Received null ACK from " + serverId);
}
return ack;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Interrupted while sending proposal to " + serverId, e);
} catch (TimeoutException e) {
throw new IOException("Timed out waiting for ACK from " + serverId, e);
} catch (ExecutionException e) {
throw new IOException("Error sending proposal to " + serverId, e.getCause());
} finally {
MDC.remove("targetServerId");
}
}
@Override
public void sendCommit(String serverId, CommitPacket commit) throws IOException {
MDC.put("targetServerId", serverId);
try {
Channel channel = getChannel(serverId);
// Wrap the request
Request request = new Request(generateRequestId(), RequestType.COMMIT, commit);
// Send the request without waiting for a response
channel.writeAndFlush(request);
} catch (Exception e) {
throw new IOException("Error sending commit to " + serverId, e);
} finally {
MDC.remove("targetServerId");
}
}
@Override
public LastZxidResponse sendEpochRequest(String serverId, EpochPacket epochPkt)
throws IOException {
MDC.put("targetServerId", serverId);
try {
Channel channel = getChannel(serverId);
RequestFuture<LastZxidResponse> future = new RequestFuture<>();
// Register the request/response mapping
Long requestId = generateRequestId();
RequestRegistry.register(requestId, future);
// Wrap the request
Request request = new Request(requestId, RequestType.EPOCH, epochPkt);
// Send the request
channel.writeAndFlush(request).sync();
// Wait for the response
LastZxidResponse response = future.get(5, TimeUnit.SECONDS);
if (response == null) {
throw new IOException("Received null LastZxidResponse from " + serverId);
}
return response;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Interrupted while sending epoch request to " + serverId, e);
} catch (TimeoutException e) {
throw new IOException("Timed out waiting for LastZxidResponse from " + serverId, e);
} catch (ExecutionException e) {
throw new IOException("Error sending epoch request to " + serverId, e.getCause());
} finally {
MDC.remove("targetServerId");
}
}
// Other interface methods omitted...
@Override
public void sendSnapshot(String serverId, byte[] snapshot, long zxid) throws IOException {
MDC.put("targetServerId", serverId);
try {
Channel channel = getChannel(serverId);
// Snapshots can be large, so send them in chunks
int chunkSize = 1024 * 1024; // 1 MB chunks
int totalChunks = (snapshot.length + chunkSize - 1) / chunkSize;
logger.info("Sending snapshot to {}, size: {} bytes, chunks: {}",
serverId, snapshot.length, totalChunks);
// Send the snapshot metadata first
SnapshotMetadata metadata = new SnapshotMetadata(zxid, snapshot.length, totalChunks);
Request metadataRequest = new Request(generateRequestId(),
RequestType.SNAPSHOT_META, metadata);
channel.writeAndFlush(metadataRequest).sync();
// Send the snapshot data chunk by chunk
for (int i = 0; i < totalChunks; i++) {
int offset = i * chunkSize;
int length = Math.min(chunkSize, snapshot.length - offset);
byte[] chunk = new byte[length];
System.arraycopy(snapshot, offset, chunk, 0, length);
SnapshotChunk snapshotChunk = new SnapshotChunk(i, totalChunks, chunk);
Request chunkRequest = new Request(generateRequestId(),
RequestType.SNAPSHOT_CHUNK, snapshotChunk);
channel.writeAndFlush(chunkRequest).sync();
if (i % 10 == 0 || i == totalChunks - 1) {
logger.debug("Sent snapshot chunk {}/{} to {}",
i + 1, totalChunks, serverId);
}
}
logger.info("Snapshot sent successfully to {}", serverId);
} catch (Exception e) {
throw new IOException("Error sending snapshot to " + serverId, e);
} finally {
MDC.remove("targetServerId");
}
}
// Get the channel connected to the given server
private Channel getChannel(String serverId) throws IOException {
Channel channel = channels.get(serverId);
if (channel == null || !channel.isActive()) {
throw new IOException("No active connection to server: " + serverId);
}
return channel;
}
// Generate a unique request ID
private static final AtomicLong requestIdGenerator = new AtomicLong(0);
private static Long generateRequestId() {
return requestIdGenerator.incrementAndGet();
}
// Shut down the client
public void shutdown() {
// Close all connections
for (Channel channel : channels.values()) {
channel.close();
}
channels.clear();
// Shut down the event loop group
workerGroup.shutdownGracefully();
}
// Request types
enum RequestType {
PROPOSAL, COMMIT, EPOCH, TRUNCATE, TRANSACTION, NEWLEADER, HEARTBEAT,
SNAPSHOT_META, SNAPSHOT_CHUNK
}
// Request object
static class Request {
private final Long id;
private final RequestType type;
private final Object payload;
public Request(Long id, RequestType type, Object payload) {
this.id = id;
this.type = type;
this.payload = payload;
}
public Long getId() {
return id;
}
public RequestType getType() {
return type;
}
public Object getPayload() {
return payload;
}
}
// Snapshot metadata
static class SnapshotMetadata {
private final long zxid;
private final int totalSize;
private final int totalChunks;
public SnapshotMetadata(long zxid, int totalSize, int totalChunks) {
this.zxid = zxid;
this.totalSize = totalSize;
this.totalChunks = totalChunks;
}
public long getZxid() {
return zxid;
}
public int getTotalSize() {
return totalSize;
}
public int getTotalChunks() {
return totalChunks;
}
}
// Snapshot data chunk
static class SnapshotChunk {
private final int chunkIndex;
private final int totalChunks;
private final byte[] data;
public SnapshotChunk(int chunkIndex, int totalChunks, byte[] data) {
this.chunkIndex = chunkIndex;
this.totalChunks = totalChunks;
this.data = data.clone(); // defensive copy
}
public int getChunkIndex() {
return chunkIndex;
}
public int getTotalChunks() {
return totalChunks;
}
public byte[] getData() {
return data.clone(); // defensive copy
}
}
// Registry mapping in-flight requests to their futures
static class RequestRegistry {
private static final ConcurrentMap<Long, RequestFuture<?>> futures = new ConcurrentHashMap<>();
public static <T> void register(Long requestId, RequestFuture<T> future) {
futures.put(requestId, future);
}
@SuppressWarnings("unchecked")
public static <T> void complete(Long requestId, T response) {
RequestFuture<T> future = (RequestFuture<T>) futures.remove(requestId);
if (future != null) {
future.complete(response);
}
}
public static void completeExceptionally(Long requestId, Throwable exception) {
RequestFuture<?> future = futures.remove(requestId);
if (future != null) {
future.completeExceptionally(exception);
}
}
}
// Request future
static class RequestFuture<T> extends CompletableFuture<T> {
// Inherits everything it needs from CompletableFuture
}
// Client-side channel handler
private class ClientHandler extends SimpleChannelInboundHandler<Response> {
@Override
protected void channelRead0(ChannelHandlerContext ctx, Response response) {
Long requestId = response.getRequestId();
if (response.isSuccess()) {
RequestRegistry.complete(requestId, response.getPayload());
} else {
RequestRegistry.completeExceptionally(requestId,
new IOException("Request failed: " + response.getErrorMessage()));
}
}
@Override
public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) {
logger.error("Network client exception", cause);
ctx.close();
}
}
// Response object
static class Response {
private final Long requestId;
private final boolean success;
private final Object payload;
private final String errorMessage;
public Response(Long requestId, boolean success, Object payload, String errorMessage) {
this.requestId = requestId;
this.success = success;
this.payload = payload;
this.errorMessage = errorMessage;
}
public Long getRequestId() {
return requestId;
}
public boolean isSuccess() {
return success;
}
public Object getPayload() {
return payload;
}
public String getErrorMessage() {
return errorMessage;
}
}
// Encoder
static class PacketEncoder extends MessageToByteEncoder<Request> {
@Override
protected void encode(ChannelHandlerContext ctx, Request msg, ByteBuf out) throws Exception {
// Use Protocol Buffers or a custom serialization scheme;
// simplified here for illustration
byte[] bytes = serializeRequest(msg);
out.writeBytes(bytes);
}
private byte[] serializeRequest(Request request) {
// A real implementation should use a proper serialization mechanism and
// write the length prefix the decoder below expects; simplified here
return new byte[0];
}
}
// Decoder
static class PacketDecoder extends ByteToMessageDecoder {
@Override
protected void decode(ChannelHandlerContext ctx, ByteBuf in, List<Object> out) throws Exception {
// Use Protocol Buffers or a custom deserialization scheme;
// simplified here for illustration
if (in.readableBytes() >= 4) { // at least the length field
in.markReaderIndex();
int length = in.readInt();
if (in.readableBytes() < length) {
in.resetReaderIndex();
return;
}
byte[] data = new byte[length];
in.readBytes(data);
Response response = deserializeResponse(data);
out.add(response);
}
}
private Response deserializeResponse(byte[] data) {
// A real implementation should use a proper deserialization mechanism;
// simplified here (a real pipeline must not add null to the output list)
return null;
}
}
}
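The simplified codec above is internally inconsistent: the encoder writes raw bytes while the decoder expects a length-prefixed frame. A minimal matching pair might look like the sketch below; it assumes JDK serialization purely for illustration (and that Request/Response implement java.io.Serializable); a production system would use Protocol Buffers or a similar schema-based wire format.
// Sketch only: length-prefixed framing over JDK serialization.
class LengthPrefixedEncoder extends MessageToByteEncoder<Request> {
@Override
protected void encode(ChannelHandlerContext ctx, Request msg, ByteBuf out) throws Exception {
ByteArrayOutputStream bos = new ByteArrayOutputStream();
try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
oos.writeObject(msg);
}
byte[] bytes = bos.toByteArray();
out.writeInt(bytes.length); // length prefix consumed by the decoder
out.writeBytes(bytes);
}
}
class LengthPrefixedDecoder extends ByteToMessageDecoder {
@Override
protected void decode(ChannelHandlerContext ctx, ByteBuf in, List<Object> out) throws Exception {
if (in.readableBytes() < 4) {
return; // wait for the length field
}
in.markReaderIndex();
int length = in.readInt();
if (in.readableBytes() < length) {
in.resetReaderIndex(); // wait for the full frame
return;
}
byte[] data = new byte[length];
in.readBytes(data);
try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(data))) {
out.add(ois.readObject());
}
}
}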
三、Paxos Algorithm Implementation
Core Interface Definitions
// Role interface definitions
public interface Proposer {
CompletableFuture<Boolean> prepare(int ballot);
CompletableFuture<Boolean> propose(int ballot, Object value);
}
public interface Acceptor {
CompletableFuture<Promise> handlePrepare(int ballot);
CompletableFuture<Accepted> handleAccept(int ballot, Object value);
}
public interface Learner {
void learn(long instanceId, int ballot, Object value);
}
public interface NetworkClient {
CompletableFuture<Promise> sendPrepare(int nodeId, int ballot);
CompletableFuture<Accepted> sendAccept(int nodeId, int ballot, Object value);
void sendLearn(int nodeId, long instanceId, int ballot, Object value);
CompletableFuture<Map<Long, PrepareResponse>> sendPrepareAllInstances(int nodeId, int ballot);
CompletableFuture<Void> sendSnapshot(int nodeId, byte[] snapshot, long lastInstanceId);
}
public interface StateMachine {
CompletableFuture<Void> apply(long instanceId, byte[] command);
long getLastApplied();
CompletableFuture<byte[]> takeSnapshot();
CompletableFuture<Void> restoreSnapshot(byte[] snapshot, long instanceId);
}
Basic Paxos Implementation
public class BasicPaxosNode implements Proposer, Acceptor, Learner, AutoCloseable {
private final int nodeId;
private final AtomicInteger ballot = new AtomicInteger(0);
private volatile Object proposalValue = null;
private final ReadWriteLock rwLock = new ReentrantReadWriteLock();
private volatile int promisedBallot = 0; // highest ballot promised in the prepare phase
private volatile int acceptedBallot = 0; // ballot of the most recently accepted proposal
private volatile Object acceptedValue = null;
private final int totalNodes;
private final NetworkClient networkClient;
private final Logger logger = LoggerFactory.getLogger(BasicPaxosNode.class);
private final RetryStrategy retryStrategy;
private final MetricsCollector metrics;
public BasicPaxosNode(int nodeId, int totalNodes, NetworkClient networkClient) {
this.nodeId = nodeId;
this.totalNodes = totalNodes;
this.networkClient = networkClient;
this.retryStrategy = new ExponentialBackoffRetry(100, 5000, 3);
this.metrics = new MetricsCollector("paxos_basic", nodeId);
}
// Proposer: prepare phase
@Override
public CompletableFuture<Boolean> prepare(int suggestedBallot) {
final int newBallot = suggestedBallot > 0 ? suggestedBallot : generateNewBallot();
final Stopwatch stopwatch = Stopwatch.createStarted();
MDC.put("component", "paxos-proposer");
MDC.put("nodeId", String.valueOf(nodeId));
MDC.put("ballot", String.valueOf(newBallot));
logger.info("Starting prepare phase with ballot {}", newBallot);
CompletableFuture<Boolean> result = new CompletableFuture<>();
CompletableFuture.runAsync(() -> {
try {
// Send a Prepare request to every Acceptor
List<CompletableFuture<Promise>> futures = sendPrepare(newBallot);
// Collect the responses
List<Promise> promises = new ArrayList<>();
for (CompletableFuture<Promise> future : futures) {
try {
Promise promise = future.get(3, TimeUnit.SECONDS);
if (promise != null) {
promises.add(promise);
}
} catch (Exception e) {
logger.warn("Error getting prepare response", e);
}
}
// Proceed only with a quorum of positive responses
int quorum = getQuorum();
int okCount = (int) promises.stream().filter(Promise::isOk).count();
if (okCount >= quorum) {
// Update the local ballot
ballot.updateAndGet(current -> Math.max(current, newBallot));
// Adopt the value of the highest-ballot proposal already accepted, if any
Promise highestPromise = selectHighestBallotPromise(promises);
rwLock.writeLock().lock();
try {
if (highestPromise != null && highestPromise.getAcceptedValue() != null) {
proposalValue = highestPromise.getAcceptedValue();
logger.info("Using previously accepted value: {}", proposalValue);
}
} finally {
rwLock.writeLock().unlock();
}
metrics.recordPrepareSuccess(stopwatch.elapsed(TimeUnit.MILLISECONDS));
result.complete(true);
} else {
logger.info("Failed to get quorum in prepare phase: {} of {} responses ok",
okCount, promises.size());
metrics.recordPrepareFailed();
result.complete(false);
}
} catch (Exception e) {
logger.error("Error in prepare phase", e);
metrics.recordPrepareFailed();
result.completeExceptionally(e);
} finally {
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("ballot");
}
});
return result;
}
// Proposer: accept phase
@Override
public CompletableFuture<Boolean> propose(int ballot, Object value) {
final Stopwatch stopwatch = Stopwatch.createStarted();
// Resolve the ballot once so the prepare and accept phases use the same number
// (passing a non-positive ballot asks the node to generate a fresh one)
final int useBallot = ballot > 0 ? ballot : generateNewBallot();
MDC.put("component", "paxos-proposer");
MDC.put("nodeId", String.valueOf(nodeId));
MDC.put("ballot", String.valueOf(useBallot));
return prepare(useBallot).thenCompose(prepared -> {
if (!prepared) {
logger.info("Prepare phase failed, cannot proceed to propose");
metrics.recordProposeFailed();
return CompletableFuture.completedFuture(false);
}
// Determine the value to propose
final Object valueToPropose;
rwLock.readLock().lock();
try {
// If the prepare phase found no previously accepted value, use the proposer's own
valueToPropose = proposalValue != null ? proposalValue : value;
logger.info("Starting accept phase with ballot {} and value {}",
useBallot, valueToPropose);
} finally {
rwLock.readLock().unlock();
}
return CompletableFuture.supplyAsync(() -> {
try {
// Send an Accept request to every Acceptor
List<CompletableFuture<Accepted>> futures = sendAccept(useBallot, valueToPropose);
// Collect the responses
List<Accepted> responses = new ArrayList<>();
for (CompletableFuture<Accepted> future : futures) {
try {
Accepted accepted = future.get(3, TimeUnit.SECONDS);
if (accepted != null) {
responses.add(accepted);
}
} catch (Exception e) {
logger.warn("Error getting accept response", e);
}
}
// Check for a quorum of accepts
int quorum = getQuorum();
int accepted = (int) responses.stream().filter(Accepted::isOk).count();
boolean success = accepted >= quorum;
if (success) {
logger.info("Value {} has been accepted by the majority ({} of {})",
valueToPropose, accepted, responses.size());
// Notify all Learners (Basic Paxos decides a single instance, hence id 1)
broadcastToLearners(1, useBallot, valueToPropose);
metrics.recordProposeSuccess(stopwatch.elapsed(TimeUnit.MILLISECONDS));
} else {
logger.info("Failed to get quorum in accept phase: {} of {} responses ok",
accepted, responses.size());
metrics.recordProposeFailed();
}
return success;
} catch (Exception e) {
logger.error("Error in propose phase", e);
metrics.recordProposeFailed();
throw new CompletionException(e);
} finally {
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("ballot");
}
});
}).exceptionally(e -> {
logger.error("Failed to propose value", e);
metrics.recordProposeFailed();
return false;
});
}
// Acceptor: handle a Prepare request
@Override
public CompletableFuture<Promise> handlePrepare(int proposalBallot) {
MDC.put("component", "paxos-acceptor");
MDC.put("nodeId", String.valueOf(nodeId));
MDC.put("ballot", String.valueOf(proposalBallot));
return CompletableFuture.supplyAsync(() -> {
Promise promise = new Promise();
rwLock.writeLock().lock();
try {
if (proposalBallot > promisedBallot) {
// Promise not to accept any proposal numbered at or below proposalBallot,
// and report the most recently accepted proposal (if any) so the proposer
// can adopt its value; promised and accepted ballots are tracked separately
promisedBallot = proposalBallot;
promise.setOk(true);
promise.setAcceptedBallot(this.acceptedBallot);
promise.setAcceptedValue(this.acceptedValue);
logger.info("Acceptor {} promised ballot {}", nodeId, proposalBallot);
metrics.recordPromiseMade();
} else {
promise.setOk(false);
logger.info("Acceptor {} rejected ballot {}, promised ballot: {}",
nodeId, proposalBallot, promisedBallot);
metrics.recordPromiseRejected();
}
return promise;
} finally {
rwLock.writeLock().unlock();
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("ballot");
}
});
}
// Acceptor: handle an Accept request
@Override
public CompletableFuture<Accepted> handleAccept(int proposalBallot, Object proposalValue) {
MDC.put("component", "paxos-acceptor");
MDC.put("nodeId", String.valueOf(nodeId));
MDC.put("ballot", String.valueOf(proposalBallot));
return CompletableFuture.supplyAsync(() -> {
Accepted accepted = new Accepted();
rwLock.writeLock().lock();
try {
// Accept only if no higher ballot has been promised since the prepare phase
if (proposalBallot >= promisedBallot) {
promisedBallot = proposalBallot;
acceptedBallot = proposalBallot;
acceptedValue = proposalValue;
accepted.setOk(true);
logger.info("Acceptor {} accepted ballot {} with value {}",
nodeId, proposalBallot, proposalValue);
metrics.recordAcceptMade();
} else {
accepted.setOk(false);
logger.info("Acceptor {} rejected accept for ballot {}, promised ballot: {}",
nodeId, proposalBallot, promisedBallot);
metrics.recordAcceptRejected();
}
return accepted;
} finally {
rwLock.writeLock().unlock();
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("ballot");
}
});
}
// Learner: learn a decided value
@Override
public void learn(long instanceId, int ballot, Object value) {
MDC.put("component", "paxos-learner");
MDC.put("nodeId", String.valueOf(nodeId));
MDC.put("instanceId", String.valueOf(instanceId));
MDC.put("ballot", String.valueOf(ballot));
try {
logger.info("Learner {} learned value {} for instance {} with ballot {}",
nodeId, value, instanceId, ballot);
metrics.recordLearnReceived();
// A real implementation would apply the learned value to the state machine here:
// applyToStateMachine(instanceId, value);
} finally {
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("instanceId");
MDC.remove("ballot");
}
}
// Send a Prepare request to every Acceptor
private List<CompletableFuture<Promise>> sendPrepare(int newBallot) {
List<CompletableFuture<Promise>> futures = new ArrayList<>();
for (int i = 0; i < totalNodes; i++) {
final int targetNodeId = i;
if (targetNodeId == this.nodeId) {
// Handle the request locally
futures.add(handlePrepare(newBallot));
} else {
// Send the request to the remote node
futures.add(networkClient.sendPrepare(targetNodeId, newBallot)
.exceptionally(e -> {
logger.error("Failed to send prepare to node {}", targetNodeId, e);
return null;
}));
}
}
return futures;
}
// Send an Accept request to every Acceptor
private List<CompletableFuture<Accepted>> sendAccept(int ballot, Object value) {
List<CompletableFuture<Accepted>> futures = new ArrayList<>();
for (int i = 0; i < totalNodes; i++) {
final int targetNodeId = i;
if (targetNodeId == this.nodeId) {
// Handle the request locally
futures.add(handleAccept(ballot, value));
} else {
// Send the request to the remote node
futures.add(networkClient.sendAccept(targetNodeId, ballot, value)
.exceptionally(e -> {
logger.error("Failed to send accept to node {}", targetNodeId, e);
return null;
}));
}
}
return futures;
}
// Notify all Learners of the decided value
private void broadcastToLearners(long instanceId, int ballot, Object value) {
for (int i = 0; i < totalNodes; i++) {
final int targetNodeId = i;
if (targetNodeId == this.nodeId) {
// Learn locally
learn(instanceId, ballot, value);
} else {
// Notify remote Learners asynchronously
CompletableFuture.runAsync(() -> {
try {
networkClient.sendLearn(targetNodeId, instanceId, ballot, value);
} catch (Exception e) {
logger.error("Failed to notify learner {}", targetNodeId, e);
}
});
}
}
}
// Select the Promise carrying the highest accepted ballot
private Promise selectHighestBallotPromise(List<Promise> promises) {
return promises.stream()
.filter(p -> p.isOk() && p.getAcceptedValue() != null)
.max(Comparator.comparingInt(Promise::getAcceptedBallot))
.orElse(null);
}
// Generate a proposal number larger than any seen so far (node ID mixed in for uniqueness)
private int generateNewBallot() {
// Guarantees monotonically increasing ballots that are unique across nodes
return ballot.incrementAndGet() * totalNodes + nodeId;
}
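// Worked example (illustrative): with totalNodes = 3 and nodeId = 1, successive calls
// return 1*3+1 = 4, then 7, 10, ...; each node's ballots fall in a distinct residue
// class mod totalNodes, so ballots from different nodes can never collide.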
// Quorum size (strict majority)
private int getQuorum() {
return totalNodes / 2 + 1;
}
@Override
public void close() {
// Release resources
metrics.close();
}
// Promise class
public static class Promise {
private boolean ok;
private int acceptedBallot;
private Object acceptedValue;
public boolean isOk() {
return ok;
}
public void setOk(boolean ok) {
this.ok = ok;
}
public int getAcceptedBallot() {
return acceptedBallot;
}
public void setAcceptedBallot(int acceptedBallot) {
this.acceptedBallot = acceptedBallot;
}
public Object getAcceptedValue() {
return acceptedValue;
}
public void setAcceptedValue(Object acceptedValue) {
this.acceptedValue = acceptedValue;
}
}
// Accepted class
public static class Accepted {
private boolean ok;
public boolean isOk() {
return ok;
}
public void setOk(boolean ok) {
this.ok = ok;
}
}
// Metrics collection
private static class MetricsCollector implements AutoCloseable {
// Metric definitions...
public MetricsCollector(String prefix, int nodeId) {
// Initialize metrics...
}
public void recordPrepareSuccess(long latencyMs) {
// Record a successful prepare phase
}
public void recordPrepareFailed() {
// Record a failed prepare phase
}
public void recordProposeSuccess(long latencyMs) {
// Record a successful propose phase
}
public void recordProposeFailed() {
// Record a failed propose phase
}
public void recordPromiseMade() {
// Count promises made
}
public void recordPromiseRejected() {
// Count rejected prepare requests
}
public void recordAcceptMade() {
// Count accepted proposals
}
public void recordAcceptRejected() {
// Count rejected accept requests
}
public void recordLearnReceived() {
// Count learn notifications
}
@Override
public void close() {
// Clean up resources
}
}
// Error handling and retry strategy
interface RetryStrategy {
<T> CompletableFuture<T> retry(Supplier<CompletableFuture<T>> action);
}
// Exponential backoff retry strategy
static class ExponentialBackoffRetry implements RetryStrategy {
private final long initialBackoffMs;
private final long maxBackoffMs;
private final int maxRetries;
private final Logger logger = LoggerFactory.getLogger(ExponentialBackoffRetry.class);
public ExponentialBackoffRetry(long initialBackoffMs, long maxBackoffMs, int maxRetries) {
this.initialBackoffMs = initialBackoffMs;
this.maxBackoffMs = maxBackoffMs;
this.maxRetries = maxRetries;
}
@Override
public <T> CompletableFuture<T> retry(Supplier<CompletableFuture<T>> action) {
return retryInternal(action, 0);
}
private <T> CompletableFuture<T> retryInternal(Supplier<CompletableFuture<T>> action,
int attempt) {
// Wrap the result so a failure can be replaced by a delayed retry future
// (the original chained execute()/join() on an Executor, which does not compile)
return action.get()
.thenApply(CompletableFuture::completedFuture)
.exceptionally(e -> {
if (attempt >= maxRetries) {
throw new CompletionException(
new RetryExhaustedException("Max retries exceeded", e));
}
// Exponential backoff with jitter, capped at maxBackoffMs
long backoff = Math.min(initialBackoffMs * (long) Math.pow(2, attempt), maxBackoffMs);
backoff += ThreadLocalRandom.current().nextLong(Math.max(1, backoff / 5));
logger.info("Retry attempt {} after {}ms due to: {}",
attempt + 1, backoff, e.getMessage());
// Schedule the retry on a delayed executor instead of blocking
return CompletableFuture.supplyAsync(() -> (Void) null,
CompletableFuture.delayedExecutor(backoff, TimeUnit.MILLISECONDS))
.thenCompose(ignored -> retryInternal(action, attempt + 1));
})
.thenCompose(Function.identity());
}
}
// Custom exception class
public static class RetryExhaustedException extends RuntimeException {
public RetryExhaustedException(String message, Throwable cause) {
super(message, cause);
}
}
}
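To see the two phases end to end, here is a minimal single-JVM harness. The loopback NetworkClient is an assumption made for illustration: it routes each call directly to the target node object instead of going over the wire, and the two bulk methods that Basic Paxos never uses are stubbed out.
// Illustrative harness: three nodes in one process, RPCs short-circuited locally.
BasicPaxosNode[] nodes = new BasicPaxosNode[3];
NetworkClient loopback = new NetworkClient() {
@Override
public CompletableFuture<Promise> sendPrepare(int nodeId, int ballot) {
return nodes[nodeId].handlePrepare(ballot);
}
@Override
public CompletableFuture<Accepted> sendAccept(int nodeId, int ballot, Object value) {
return nodes[nodeId].handleAccept(ballot, value);
}
@Override
public void sendLearn(int nodeId, long instanceId, int ballot, Object value) {
nodes[nodeId].learn(instanceId, ballot, value);
}
@Override
public CompletableFuture<Map<Long, PrepareResponse>> sendPrepareAllInstances(int nodeId, int ballot) {
return CompletableFuture.completedFuture(Collections.emptyMap()); // unused by Basic Paxos
}
@Override
public CompletableFuture<Void> sendSnapshot(int nodeId, byte[] snapshot, long lastInstanceId) {
return CompletableFuture.completedFuture(null); // unused by Basic Paxos
}
};
for (int i = 0; i < nodes.length; i++) {
nodes[i] = new BasicPaxosNode(i, nodes.length, loopback);
}
// Passing 0 lets the proposer generate a fresh ballot; propose() runs the
// prepare phase internally before the accept phase.
boolean chosen = nodes[0].propose(0, "v1").join();
System.out.println("value chosen by majority: " + chosen);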
Multi-Paxos Implementation
The following Multi-Paxos implementation uses a component-based architecture; separating concerns keeps the code maintainable:
public class MultiPaxosSystem {
private final int nodeId;
private final Configuration config;
private final MultiPaxosLog log;
private final MultiPaxosStateMachine stateMachine;
private final MultiPaxosNetworking networking;
private final RoleManager roleManager;
private final ScheduledExecutorService scheduler;
private final Logger logger = LoggerFactory.getLogger(MultiPaxosSystem.class);
public MultiPaxosSystem(int nodeId, Configuration config) {
this.nodeId = nodeId;
this.config = config;
this.log = new MultiPaxosLog();
this.stateMachine = new MultiPaxosStateMachine();
this.networking = new MultiPaxosNetworking(nodeId, config.getNodes());
this.roleManager = new RoleManager(this);
this.scheduler = Executors.newScheduledThreadPool(2, r -> {
Thread t = new Thread(r, "multi-paxos-scheduler-" + nodeId);
t.setDaemon(true);
return t;
});
// Start the log-application task
scheduler.scheduleWithFixedDelay(this::applyCommittedLogs, 100, 100, TimeUnit.MILLISECONDS);
// Start the leader-lease check
scheduler.scheduleWithFixedDelay(this::checkLeaderLease, 1000, 1000, TimeUnit.MILLISECONDS);
}
// Client API
// Append a new log entry (write operation)
public CompletableFuture<Boolean> appendLog(byte[] command) {
if (!roleManager.isLeader()) {
return CompletableFuture.failedFuture(
new NotLeaderException("Not the leader", roleManager.getLeaderHint()));
}
return roleManager.getLeaderRole().appendLog(command);
}
// Read operation
public CompletableFuture<byte[]> read(String key, ConsistencyLevel level) {
switch (level) {
case LINEARIZABLE:
if (!roleManager.isLeader()) {
return CompletableFuture.failedFuture(
new NotLeaderException("Not the leader", roleManager.getLeaderHint()));
}
return roleManager.getLeaderRole().linearizableRead(key);
case SEQUENTIAL:
return roleManager.getFollowerRole().sequentialRead(key);
case EVENTUAL:
default:
return roleManager.getFollowerRole().eventualRead(key);
}
}
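// Consistency/latency trade-off in read(): LINEARIZABLE is routed through the leader
// and its lease check; SEQUENTIAL first waits for the local state machine to catch up
// to the known commit index; EVENTUAL returns whatever the local replica has applied.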
// Attempt to become leader
public CompletableFuture<Boolean> electSelf() {
return roleManager.electSelf();
}
// Log application
private void applyCommittedLogs() {
try {
long applied = stateMachine.getLastApplied();
long toApply = log.getCommitIndex();
if (applied >= toApply) {
return; // everything already applied
}
List<CompletableFuture<Void>> applyFutures = new ArrayList<>();
// Apply every entry from applied+1 through toApply
for (long i = applied + 1; i <= toApply; i++) {
final long instanceId = i;
LogEntry entry = log.getEntry(instanceId);
if (entry != null && entry.isCommitted()) {
applyFutures.add(
stateMachine.apply(instanceId, entry.getCommand())
.thenRun(() -> {
logger.debug("Applied log entry at instance {} to state machine",
instanceId);
})
.exceptionally(e -> {
logger.error("Failed to apply log at instance {}", instanceId, e);
return null;
})
);
}
}
// Wait for all applications to finish
CompletableFuture.allOf(applyFutures.toArray(new CompletableFuture[0]))
.thenRun(() -> {
// Log compaction
if (toApply - applied > 1000) { // after applying a large batch, consider compacting
log.compactLogs(stateMachine.getLastApplied());
}
})
.exceptionally(e -> {
logger.error("Error during log application", e);
return null;
});
} catch (Exception e) {
logger.error("Error applying committed logs", e);
}
}
// Check the leader lease
private void checkLeaderLease() {
if (roleManager.isLeader()) {
roleManager.getLeaderRole().checkLease();
}
}
// Shut down the system
public void shutdown() {
try {
List<Runnable> pendingTasks = scheduler.shutdownNow();
if (!pendingTasks.isEmpty()) {
logger.warn("Scheduler shutdown with {} pending tasks", pendingTasks.size());
}
if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) {
logger.warn("Scheduler did not terminate in time");
}
networking.close();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted while waiting for scheduler termination");
}
}
// Role management
public class RoleManager {
private final MultiPaxosSystem system;
private final AtomicBoolean isLeader = new AtomicBoolean(false);
private final AtomicInteger currentBallot = new AtomicInteger(0);
private volatile int leaderNodeId = -1; // -1 means unknown
private final LeaderRole leaderRole;
private final FollowerRole followerRole;
public RoleManager(MultiPaxosSystem system) {
this.system = system;
this.leaderRole = new LeaderRole(system);
this.followerRole = new FollowerRole(system);
}
public boolean isLeader() {
return isLeader.get();
}
public int getLeaderHint() {
return leaderNodeId;
}
public LeaderRole getLeaderRole() {
return leaderRole;
}
public FollowerRole getFollowerRole() {
return followerRole;
}
public int getCurrentBallot() {
return currentBallot.get();
}
public void setCurrentBallot(int ballot) {
currentBallot.set(ballot);
}
public CompletableFuture<Boolean> electSelf() {
return leaderRole.electSelf().thenApply(elected -> {
if (elected) {
isLeader.set(true);
leaderNodeId = nodeId;
}
return elected;
});
}
public void stepDown() {
if (isLeader.compareAndSet(true, false)) {
logger.info("Node {} stepping down from leader", nodeId);
}
}
public void recognizeLeader(int leaderId, int ballot) {
leaderNodeId = leaderId;
currentBallot.set(ballot);
if (leaderId != nodeId) {
isLeader.set(false);
}
}
}
// Leader role implementation
public class LeaderRole {
private final MultiPaxosSystem system;
private final AtomicLong leaseExpirationTime = new AtomicLong(0);
private final long leaderLeaseMs = 5000; // 5-second lease
public LeaderRole(MultiPaxosSystem system) {
this.system = system;
}
// Leader election
public CompletableFuture<Boolean> electSelf() {
MDC.put("component", "multi-paxos-leader");
MDC.put("nodeId", String.valueOf(nodeId));
logger.info("Node {} attempting to become leader", nodeId);
try {
int newBallot = generateNewBallot();
MDC.put("ballot", String.valueOf(newBallot));
return CompletableFuture.supplyAsync(() -> {
try {
// Run the Prepare phase
Map<Long, PrepareResponse> responseMap = networking.sendPrepareForAllInstances(newBallot)
.get(10, TimeUnit.SECONDS);
// Check for quorum support
if (hasQuorumPromises(responseMap)) {
// Update the local log from the gathered responses
updateLogFromPromises(responseMap);
// Become leader
system.roleManager.setCurrentBallot(newBallot);
system.roleManager.recognizeLeader(nodeId, newBallot);
logger.info("Node {} became leader with ballot {}", nodeId, newBallot);
renewLease();
// Run the accept phase so earlier log entries get accepted by a quorum
confirmPendingLogs();
return true;
} else {
logger.info("Failed to become leader - did not get quorum promises");
return false;
}
} catch (Exception e) {
logger.error("Error in become leader process", e);
return false;
} finally {
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("ballot");
}
});
} catch (Exception e) {
logger.error("Error initiating leader election", e);
MDC.remove("component");
MDC.remove("nodeId");
return CompletableFuture.failedFuture(e);
}
}
// Leader: append a new log entry
public CompletableFuture<Boolean> appendLog(byte[] command) {
Stopwatch stopwatch = Stopwatch.createStarted();
MDC.put("component", "multi-paxos-leader");
MDC.put("nodeId", String.valueOf(nodeId));
if (!system.roleManager.isLeader()) {
MDC.remove("component");
MDC.remove("nodeId");
return CompletableFuture.failedFuture(
new NotLeaderException("Node is not the leader", system.roleManager.getLeaderHint()));
}
try {
long nextInstance = system.log.getNextInstanceId();
MDC.put("instanceId", String.valueOf(nextInstance));
logger.info("Leader {} appending log at instance {}", nodeId, nextInstance);
// Create the log entry
int currentBallot = system.roleManager.getCurrentBallot();
LogEntry entry = new LogEntry(currentBallot, command.clone()); // defensive copy
// Store the log entry
system.log.setEntry(nextInstance, entry);
// An established leader can skip the Prepare phase and go straight to Accept
return CompletableFuture.supplyAsync(() -> {
try {
List<AcceptResponse> responses = networking.sendAcceptRequests(
nextInstance, currentBallot, command)
.get(5, TimeUnit.SECONDS);
// With a quorum of accepts
int quorum = getQuorum();
if (countAccepts(responses) >= quorum) {
// Commit the entry
entry.setCommitted(true);
system.log.updateCommitIndex(nextInstance);
// Notify all nodes of the commit
networking.sendCommitNotifications(nextInstance, currentBallot);
logger.info("Log entry at instance {} has been committed", nextInstance);
return true;
} else {
logger.warn("Failed to get quorum for instance {}", nextInstance);
// Leadership may have been lost; step down and let a re-election follow
system.roleManager.stepDown();
return false;
}
} catch (Exception e) {
logger.error("Error in append log", e);
throw new CompletionException(e);
} finally {
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("instanceId");
}
});
} catch (Exception e) {
logger.error("Error initiating append log", e);
MDC.remove("component");
MDC.remove("nodeId");
return CompletableFuture.failedFuture(e);
}
}
// Linearizable read (confirmed through the leader)
public CompletableFuture<byte[]> linearizableRead(String key) {
if (!system.roleManager.isLeader()) {
return CompletableFuture.failedFuture(
new NotLeaderException("Not the leader", system.roleManager.getLeaderHint()));
}
// Check the lease
if (System.currentTimeMillis() >= leaseExpirationTime.get()) {
// Lease expired; leadership must be re-confirmed before serving the read
return renewLease().thenCompose(renewed -> {
if (!renewed) {
return CompletableFuture.failedFuture(
new ConsistencyException("Could not renew leadership lease"));
}
return system.stateMachine.read(key);
});
}
// Lease still valid; read directly
return system.stateMachine.read(key);
}
// Renew the leader lease
private CompletableFuture<Boolean> renewLease() {
if (!system.roleManager.isLeader()) {
return CompletableFuture.completedFuture(false);
}
return CompletableFuture.supplyAsync(() -> {
try {
// Heartbeat a majority to confirm this node is still the leader
int currentBallot = system.roleManager.getCurrentBallot();
int responses = networking.sendLeadershipHeartbeats(currentBallot)
.get(3, TimeUnit.SECONDS);
if (responses >= getQuorum()) {
leaseExpirationTime.set(System.currentTimeMillis() + leaderLeaseMs);
logger.debug("Renewed leader lease until {}", leaseExpirationTime.get());
return true;
} else {
logger.warn("Failed to renew leadership lease");
return false;
}
} catch (Exception e) {
logger.error("Error renewing leadership lease", e);
return false;
}
});
}
// Check the lease status
public void checkLease() {
if (!system.roleManager.isLeader()) {
return;
}
// If the lease is about to expire, try to renew it
long now = System.currentTimeMillis();
long expiration = leaseExpirationTime.get();
// Renew early if the lease will expire within one second
if (now + 1000 > expiration) {
renewLease().thenAccept(renewed -> {
if (!renewed) {
logger.warn("Lease renewal failed, stepping down as leader");
system.roleManager.stepDown();
}
});
}
}
// Ensure earlier log entries are accepted by a quorum
private void confirmPendingLogs() {
// Implementation elided...
}
// Update the local log from prepare responses
private void updateLogFromPromises(Map<Long, PrepareResponse> responseMap) {
// Implementation elided...
}
// Check whether a quorum of promises was obtained
private boolean hasQuorumPromises(Map<Long, PrepareResponse> responseMap) {
// Implementation elided...
return true; // simplified
}
// Count positive accept responses
private int countAccepts(List<AcceptResponse> responses) {
return (int) responses.stream()
.filter(r -> r != null && r.isAccepted())
.count();
}
}
// Follower role implementation
public class FollowerRole {
private final MultiPaxosSystem system;
private final Map<String, CacheEntry> readCache = new ConcurrentHashMap<>();
private final long maxCacheAgeMs = 5000; // 5-second cache expiry
public FollowerRole(MultiPaxosSystem system) {
this.system = system;
}
// Handle a heartbeat message
public void handleHeartbeat(int leaderBallot, int leaderNodeId, long leaderCommitIndex) {
// Update the local commit index
system.log.updateCommitIndex(leaderCommitIndex);
// If this node thinks it is leader but sees a heartbeat with a higher ballot, step down
if (system.roleManager.isLeader() && leaderBallot > system.roleManager.getCurrentBallot()) {
logger.info("Stepping down as leader due to heartbeat with higher ballot: {}",
leaderBallot);
system.roleManager.stepDown();
}
// Record the current leader
system.roleManager.recognizeLeader(leaderNodeId, leaderBallot);
}
// Sequential read (guaranteed to observe all earlier committed writes)
public CompletableFuture<byte[]> sequentialRead(String key) {
// Make sure every committed transaction has been applied locally first
return ensureAppliedUpToCommitIndex()
.thenCompose(v -> system.stateMachine.read(key));
}
// Eventually consistent read (served straight from the local replica)
public CompletableFuture<byte[]> eventualRead(String key) {
return system.stateMachine.read(key);
}
// Ensure the state machine has caught up to the current commitIndex
private CompletableFuture<Void> ensureAppliedUpToCommitIndex() {
long current = system.log.getCommitIndex();
long applied = system.stateMachine.getLastApplied();
if (applied >= current) {
return CompletableFuture.completedFuture(null); // everything already applied
}
// Wait for application to finish
CompletableFuture<Void> result = new CompletableFuture<>();
scheduler.execute(() -> {
try {
// Trigger application
system.applyCommittedLogs();
// Check whether application has caught up
if (system.stateMachine.getLastApplied() >= current) {
result.complete(null);
} else {
// Application may lag slightly; check once more after a short delay
scheduler.schedule(() -> {
system.applyCommittedLogs();
result.complete(null);
}, 50, TimeUnit.MILLISECONDS);
}
} catch (Exception e) {
result.completeExceptionally(e);
}
});
return result;
}
// Clean up expired cache entries
public void cleanupReadCache() {
long now = System.currentTimeMillis();
// Remove expired entries
readCache.entrySet().removeIf(entry ->
now - entry.getValue().getTimestamp() > maxCacheAgeMs);
// If the cache grows too large, evict the oldest entries
if (readCache.size() > config.getMaxCacheSize()) {
List<String> oldestKeys = readCache.entrySet().stream()
.sorted(Comparator.comparingLong(e -> e.getValue().getTimestamp()))
.limit(readCache.size() - config.getMaxCacheSize())
.map(Map.Entry::getKey)
.collect(Collectors.toList());
for (String key : oldestKeys) {
readCache.remove(key);
}
logger.info("Cache cleanup: removed {} old entries", oldestKeys.size());
}
}
}
// Internal component classes
// Log management
public static class MultiPaxosLog {
private final ReadWriteLock logLock = new ReentrantReadWriteLock();
private final ConcurrentNavigableMap<Long, LogEntry> log = new ConcurrentSkipListMap<>();
private final AtomicLong nextInstanceId = new AtomicLong(1);
private final AtomicLong commitIndex = new AtomicLong(0);
private final Logger logger = LoggerFactory.getLogger(MultiPaxosLog.class);
public LogEntry getEntry(long index) {
logLock.readLock().lock();
try {
return log.get(index);
} finally {
logLock.readLock().unlock();
}
}
public void setEntry(long index, LogEntry entry) {
logLock.writeLock().lock();
try {
log.put(index, entry);
nextInstanceId.updateAndGet(current -> Math.max(current, index + 1));
} finally {
logLock.writeLock().unlock();
}
}
public long getNextInstanceId() {
return nextInstanceId.getAndIncrement();
}
public long getCommitIndex() {
return commitIndex.get();
}
public void updateCommitIndex(long newCommitIndex) {
// Atomically advance the commit index (monotonically non-decreasing)
commitIndex.updateAndGet(current -> Math.max(current, newCommitIndex));
}
// Log compaction
public void compactLogs(long appliedIndex) {
// Keep recent entries and drop older ones
final int retentionWindow = 1000; // keep the most recent 1000 entries
long truncatePoint = appliedIndex - retentionWindow;
if (truncatePoint <= 0) {
return; // nothing to compact
}
logLock.writeLock().lock();
try {
List<Long> toRemove = log.keySet().stream()
.filter(idx -> idx < truncatePoint)
.collect(Collectors.toList());
for (Long idx : toRemove) {
log.remove(idx);
}
logger.info("Compacted {} log entries before index {}",
toRemove.size(), truncatePoint);
} finally {
logLock.writeLock().unlock();
}
}
}
// State machine implementation
public static class MultiPaxosStateMachine {
private final AtomicLong lastApplied = new AtomicLong(0);
private final Map<String, byte[]> keyValueStore = new ConcurrentHashMap<>();
private final Logger logger = LoggerFactory.getLogger(MultiPaxosStateMachine.class);
public CompletableFuture<Void> apply(long instanceId, byte[] command) {
return CompletableFuture.runAsync(() -> {
try {
// Parse the command
Command cmd = deserializeCommand(command);
// Apply it to the state machine
if (cmd.getType() == CommandType.PUT) {
keyValueStore.put(cmd.getKey(), cmd.getValue());
} else if (cmd.getType() == CommandType.DELETE) {
keyValueStore.remove(cmd.getKey());
}
// Advance the last-applied index
lastApplied.updateAndGet(current -> Math.max(current, instanceId));
} catch (Exception e) {
logger.error("Error applying command at instance {}", instanceId, e);
throw new CompletionException(e);
}
});
}
public CompletableFuture<byte[]> read(String key) {
return CompletableFuture.supplyAsync(() -> {
byte[] value = keyValueStore.get(key);
return value != null ? value.clone() : null; // defensive copy
});
}
public long getLastApplied() {
return lastApplied.get();
}
public CompletableFuture<byte[]> takeSnapshot() {
return CompletableFuture.supplyAsync(() -> {
try {
// Create a state machine snapshot
return serializeState();
} catch (Exception e) {
logger.error("Error taking snapshot", e);
throw new CompletionException(e);
}
});
}
public CompletableFuture<Void> restoreSnapshot(byte[] snapshot, long instanceId) {
return CompletableFuture.runAsync(() -> {
try {
// Restore state from the snapshot
deserializeState(snapshot);
// Advance the last-applied index
lastApplied.set(instanceId);
} catch (Exception e) {
logger.error("Error restoring snapshot", e);
throw new CompletionException(e);
}
});
}
// Serialization helpers
private Command deserializeCommand(byte[] data) {
// A real implementation should use a proper serialization mechanism
return new Command(CommandType.PUT, "key", data); // simplified example
}
private byte[] serializeState() {
// A real implementation should use a proper serialization mechanism
return new byte[0]; // simplified example
}
private void deserializeState(byte[] data) {
// A real implementation should use a proper deserialization mechanism
// (simplified example)
}
}
// Networking layer
public static class MultiPaxosNetworking implements AutoCloseable {
private final int nodeId;
private final Map<Integer, NodeInfo> nodes;
private final NetworkClient client;
private final Logger logger = LoggerFactory.getLogger(MultiPaxosNetworking.class);
public MultiPaxosNetworking(int nodeId, Map<Integer, NodeInfo> nodes) {
this.nodeId = nodeId;
this.nodes = new HashMap<>(nodes);
this.client = createNetworkClient();
}
private NetworkClient createNetworkClient() {
// A real implementation should build an appropriate network client here
return new NetworkClientImpl();
}
public CompletableFuture<Map<Long, PrepareResponse>> sendPrepareForAllInstances(int ballot) {
// Implementation elided...
return CompletableFuture.completedFuture(new HashMap<>());
}
public CompletableFuture<List<AcceptResponse>> sendAcceptRequests(
long instanceId, int ballot, byte[] command) {
// Implementation elided...
return CompletableFuture.completedFuture(new ArrayList<>());
}
public CompletableFuture<Integer> sendLeadershipHeartbeats(int ballot) {
// Implementation elided...
return CompletableFuture.completedFuture(0);
}
public void sendCommitNotifications(long instanceId, int ballot) {
// Implementation elided...
}
@Override
public void close() {
// Close the network client
}
}
// Generate a new, globally unique ballot
private int generateNewBallot() {
// The new ballot is strictly larger than the previous one, and ballots
// generated by different nodes can never collide
int currentBallot = roleManager.getCurrentBallot();
return (currentBallot / config.getTotalNodes() + 1) * config.getTotalNodes() + nodeId;
}
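// Worked example (illustrative): with 3 nodes and nodeId = 1, a current ballot of 5
// yields (5 / 3 + 1) * 3 + 1 = 7; repeated calls then give 10, 13, ..., so ballots
// are strictly increasing and stay congruent to the node ID mod totalNodes.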
// Quorum size (strict majority)
private int getQuorum() {
return config.getTotalNodes() / 2 + 1;
}
// Log entry
public static class LogEntry {
private int ballot;
private final byte[] command;
private volatile boolean committed;
LogEntry(int ballot, byte[] command) {
this.ballot = ballot;
this.command = command.clone(); // defensive copy
this.committed = false;
}
public int getBallot() {
return ballot;
}
public void setBallot(int ballot) {
this.ballot = ballot;
}
public byte[] getCommand() {
return command.clone(); // defensive copy
}
public boolean isCommitted() {
return committed;
}
public void setCommitted(boolean committed) {
this.committed = committed;
}
}
// Configuration class
public static class Configuration {
private final int totalNodes;
private final Map<Integer, NodeInfo> nodes;
private final int maxCacheSize;
public Configuration(int totalNodes, Map<Integer, NodeInfo> nodes, int maxCacheSize) {
this.totalNodes = totalNodes;
this.nodes = new HashMap<>(nodes);
this.maxCacheSize = maxCacheSize;
}
public int getTotalNodes() {
return totalNodes;
}
public Map<Integer, NodeInfo> getNodes() {
return Collections.unmodifiableMap(nodes);
}
public int getMaxCacheSize() {
return maxCacheSize;
}
}
// Node information
public static class NodeInfo {
private final int id;
private final String host;
private final int port;
public NodeInfo(int id, String host, int port) {
this.id = id;
this.host = host;
this.port = port;
}
public int getId() {
return id;
}
public String getHost() {
return host;
}
public int getPort() {
return port;
}
}
// Command types
enum CommandType {
PUT, DELETE
}
// Command object
static class Command {
private final CommandType type;
private final String key;
private final byte[] value;
public Command(CommandType type, String key, byte[] value) {
this.type = type;
this.key = key;
this.value = value != null ? value.clone() : null; // defensive copy
}
public CommandType getType() {
return type;
}
public String getKey() {
return key;
}
public byte[] getValue() {
return value != null ? value.clone() : null; // defensive copy
}
}
// Response classes
public static class PrepareResponse {
// Implementation elided...
}
public static class AcceptResponse {
private final boolean accepted;
public AcceptResponse(boolean accepted) {
this.accepted = accepted;
}
public boolean isAccepted() {
return accepted;
}
}
// Consistency levels
public enum ConsistencyLevel {
LINEARIZABLE, // linearizable consistency
SEQUENTIAL, // sequential consistency
EVENTUAL // eventual consistency
}
// Exception classes
public static class NotLeaderException extends RuntimeException {
private final int leaderHint;
public NotLeaderException(String message, int leaderHint) {
super(message);
this.leaderHint = leaderHint;
}
public int getLeaderHint() {
return leaderHint;
}
}
public static class ConsistencyException extends RuntimeException {
public ConsistencyException(String message) {
super(message);
}
}
// Simplified (stub) network client implementation
private static class NetworkClientImpl implements NetworkClient {
// Network interface methods stubbed out...
@Override
public CompletableFuture<Promise> sendPrepare(int nodeId, int ballot) {
return null;
}
@Override
public CompletableFuture<Accepted> sendAccept(int nodeId, int ballot, Object value) {
return null;
}
@Override
public void sendLearn(int nodeId, long instanceId, int ballot, Object value) {
}
@Override
public CompletableFuture<Map<Long, PrepareResponse>> sendPrepareAllInstances(int nodeId, int ballot) {
return null;
}
@Override
public CompletableFuture<Void> sendSnapshot(int nodeId, byte[] snapshot, long lastInstanceId) {
return null;
}
}
}
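A minimal bootstrap sketch follows. It rests on two stated assumptions: the stub NetworkClientImpl has been replaced with a real transport, and the node addresses below are placeholders.
// Illustrative bootstrap of a three-node Multi-Paxos cluster.
Map<Integer, MultiPaxosSystem.NodeInfo> members = new HashMap<>();
members.put(0, new MultiPaxosSystem.NodeInfo(0, "10.0.0.1", 9001));
members.put(1, new MultiPaxosSystem.NodeInfo(1, "10.0.0.2", 9001));
members.put(2, new MultiPaxosSystem.NodeInfo(2, "10.0.0.3", 9001));
MultiPaxosSystem.Configuration cfg = new MultiPaxosSystem.Configuration(3, members, 10_000);
MultiPaxosSystem node0 = new MultiPaxosSystem(0, cfg);
node0.electSelf()
.thenCompose(elected -> elected
? node0.appendLog("PUT k1 v1".getBytes(StandardCharsets.UTF_8))
: CompletableFuture.<Boolean>completedFuture(false))
.whenComplete((committed, err) -> {
if (err != null) {
System.err.println("write failed: " + err.getMessage());
} else {
System.out.println("write committed: " + committed);
}
});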
四、Network Partition Handling and Membership Changes
Network Partition Detection
public class PartitionHandler implements AutoCloseable {
private final String nodeId;
private final AtomicLong lastHeartbeatTime = new AtomicLong(0);
private final AtomicBoolean suspectPartition = new AtomicBoolean(false);
private final ScheduledExecutorService scheduler;
private final long heartbeatTimeoutMs;
private final Consumer<PartitionEvent> partitionCallback;
private final Logger logger = LoggerFactory.getLogger(PartitionHandler.class);
public PartitionHandler(String nodeId, long heartbeatTimeoutMs,
Consumer<PartitionEvent> partitionCallback) {
this.nodeId = nodeId;
this.heartbeatTimeoutMs = heartbeatTimeoutMs;
this.partitionCallback = partitionCallback;
this.scheduler = Executors.newSingleThreadScheduledExecutor(r -> {
Thread t = new Thread(r, "partition-detector-" + nodeId);
t.setDaemon(true);
return t;
});
// Start the heartbeat-monitoring task
scheduler.scheduleAtFixedRate(
this::checkHeartbeat,
heartbeatTimeoutMs / 2,
heartbeatTimeoutMs / 2,
TimeUnit.MILLISECONDS
);
}
// Record a received heartbeat
public void recordHeartbeat() {
lastHeartbeatTime.set(System.currentTimeMillis());
if (suspectPartition.compareAndSet(true, false)) {
logger.info("Node {} no longer suspects network partition", nodeId);
partitionCallback.accept(new PartitionEvent(PartitionStatus.RECOVERED, nodeId));
}
}
// Check for heartbeat timeout
private void checkHeartbeat() {
try {
long now = System.currentTimeMillis();
long last = lastHeartbeatTime.get();
if (last > 0 && now - last > heartbeatTimeoutMs) {
// A network partition may have occurred
if (suspectPartition.compareAndSet(false, true)) {
logger.warn("Node {} suspects network partition, last heartbeat: {}ms ago",
nodeId, now - last);
// Invoke the partition-detection callback
partitionCallback.accept(new PartitionEvent(PartitionStatus.SUSPECTED, nodeId));
}
}
} catch (Exception e) {
logger.error("Error checking heartbeat", e);
}
}
@Override
public void close() {
scheduler.shutdownNow();
try {
if (!scheduler.awaitTermination(1, TimeUnit.SECONDS)) {
logger.warn("Partition detector scheduler did not terminate in time");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted while shutting down partition detector");
}
}
// Partition status enum
public enum PartitionStatus {
SUSPECTED, // a partition is suspected
CONFIRMED, // a partition is confirmed
RECOVERED // the partition has healed
}
// Partition event class
public static class PartitionEvent {
private final PartitionStatus status;
private final String nodeId;
public PartitionEvent(PartitionStatus status, String nodeId) {
this.status = status;
this.nodeId = nodeId;
}
public PartitionStatus getStatus() {
return status;
}
public String getNodeId() {
return nodeId;
}
}
}
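A short usage sketch; the reactions inside the callback are illustrative assumptions about how a node might respond, not part of the handler itself.
// Wire the detector into a node with a 3-second heartbeat timeout.
PartitionHandler handler = new PartitionHandler("node-1", 3000, event -> {
if (event.getStatus() == PartitionHandler.PartitionStatus.SUSPECTED) {
// e.g. a leader should stop serving linearizable reads until heartbeats resume
System.out.println("partition suspected on " + event.getNodeId());
} else if (event.getStatus() == PartitionHandler.PartitionStatus.RECOVERED) {
System.out.println("heartbeats resumed on " + event.getNodeId());
}
});
// Call this from the heartbeat receive path:
handler.recordHeartbeat();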
Membership Change Implementation
public class MembershipManager implements AutoCloseable {
private final ConcurrentMap<String, ServerInfo> servers = new ConcurrentHashMap<>();
private volatile Configuration currentConfig;
private final AtomicLong configVersion = new AtomicLong(0);
private final String nodeId;
private final AtomicBoolean isLeader = new AtomicBoolean(false);
private final Logger logger = LoggerFactory.getLogger(MembershipManager.class);
private final ConfigurationStore configStore;
private final NetworkClient networkClient;
private final StampedLock configLock = new StampedLock();
public MembershipManager(String nodeId, boolean isLeader,
ConfigurationStore configStore,
NetworkClient networkClient) {
this.nodeId = nodeId;
this.isLeader.set(isLeader);
this.configStore = configStore;
this.networkClient = networkClient;
// Initialize the configuration
try {
this.currentConfig = configStore.loadConfiguration();
if (this.currentConfig == null) {
this.currentConfig = new Configuration(configVersion.get(), new HashMap<>());
}
servers.putAll(currentConfig.getServers());
} catch (IOException e) {
logger.error("Failed to load configuration", e);
this.currentConfig = new Configuration(configVersion.get(), new HashMap<>());
}
}
// Two-phase membership change - safely add a node
public CompletableFuture<Boolean> addServer(String serverId, String address, int port) {
MDC.put("component", "membership-manager");
MDC.put("nodeId", nodeId);
MDC.put("targetServerId", serverId);
if (!isLeader.get()) {
logger.warn("Only leader can change membership");
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("targetServerId");
return CompletableFuture.failedFuture(
new IllegalStateException("Only leader can change membership"));
}
CompletableFuture<Boolean> result = new CompletableFuture<>();
CompletableFuture.runAsync(() -> {
// Note: the write lock only guards construction of the new configurations;
// the asynchronous commit chain below completes after the lock is released
long stamp = configLock.writeLock();
try {
logger.info("Starting server addition: {}", serverId);
// Phase 1: create the joint configuration (contains both old and new nodes)
Configuration oldConfig = currentConfig;
Configuration jointConfig = createJointConfig(oldConfig, serverId, address, port);
// Commit the joint configuration to the cluster
commitConfiguration(jointConfig).thenAccept(committed -> {
if (!committed) {
logger.warn("Failed to commit joint configuration for server {}", serverId);
result.complete(false);
return;
}
logger.info("Joint configuration committed, proceeding to second phase");
// Phase 2: create the new configuration (confirms the new node)
Configuration newConfig = createNewConfig(jointConfig);
// Commit the new configuration to the cluster
commitConfiguration(newConfig).thenAccept(finalCommitted -> {
if (finalCommitted) {
logger.info("Server {} successfully added to cluster", serverId);
} else {
logger.warn("Failed to commit final configuration for server {}", serverId);
}
result.complete(finalCommitted);
}).exceptionally(e -> {
logger.error("Error committing final configuration for server {}",
serverId, e);
result.completeExceptionally(e);
return null;
});
}).exceptionally(e -> {
logger.error("Error committing joint configuration for server {}",
serverId, e);
result.completeExceptionally(e);
return null;
});
} catch (Exception e) {
logger.error("Error adding server {}", serverId, e);
result.completeExceptionally(e);
} finally {
configLock.unlockWrite(stamp);
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("targetServerId");
}
});
return result;
}
// Two-phase membership change - safely remove a node
public CompletableFuture<Boolean> removeServer(String serverId) {
MDC.put("component", "membership-manager");
MDC.put("nodeId", nodeId);
MDC.put("targetServerId", serverId);
if (!isLeader.get()) {
logger.warn("Only leader can change membership");
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("targetServerId");
return CompletableFuture.failedFuture(
new IllegalStateException("Only leader can change membership"));
}
if (!servers.containsKey(serverId)) {
logger.warn("Server {} not found in configuration", serverId);
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("targetServerId");
return CompletableFuture.completedFuture(false);
}
CompletableFuture<Boolean> result = new CompletableFuture<>();
CompletableFuture.runAsync(() -> {
long stamp = configLock.writeLock();
try {
logger.info("Starting server removal: {}", serverId);
// Phase 1: create the joint configuration (marks the node to be removed)
Configuration oldConfig = currentConfig;
Configuration jointConfig = createJointConfig(oldConfig, serverId);
// Commit the joint configuration to the cluster
commitConfiguration(jointConfig).thenAccept(committed -> {
if (!committed) {
logger.warn("Failed to commit joint configuration for removing server {}",
serverId);
result.complete(false);
return;
}
logger.info("Joint configuration committed, proceeding to second phase");
// Phase 2: create the new configuration (without the target node)
Configuration newConfig = createNewConfigWithout(jointConfig, serverId);
// Commit the new configuration to the cluster
commitConfiguration(newConfig).thenAccept(finalCommitted -> {
if (finalCommitted) {
logger.info("Server {} successfully removed from cluster", serverId);
} else {
logger.warn("Failed to commit final configuration for removing server {}",
serverId);
}
result.complete(finalCommitted);
}).exceptionally(e -> {
logger.error("Error committing final configuration for removing server {}",
serverId, e);
result.completeExceptionally(e);
return null;
});
}).exceptionally(e -> {
logger.error("Error committing joint configuration for removing server {}",
serverId, e);
result.completeExceptionally(e);
return null;
});
} catch (Exception e) {
logger.error("Error removing server {}", serverId, e);
result.completeExceptionally(e);
} finally {
configLock.unlockWrite(stamp);
MDC.remove("component");
MDC.remove("nodeId");
MDC.remove("targetServerId");
}
});
return result;
}
// Create the joint configuration (adding a node)
private Configuration createJointConfig(Configuration oldConfig,
String newServerId, String address, int port) {
Map<String, ServerInfo> newServers = new HashMap<>(oldConfig.getServers());
newServers.put(newServerId, new ServerInfo(newServerId, address, port));
return new Configuration(configVersion.incrementAndGet(), newServers);
}
// Create the joint configuration (removing a node)
private Configuration createJointConfig(Configuration oldConfig, String serverId) {
    // The node stays in the joint configuration but is flagged for removal
Map<String, ServerInfo> jointServers = new HashMap<>(oldConfig.getServers());
ServerInfo serverInfo = jointServers.get(serverId);
if (serverInfo != null) {
ServerInfo markedServer = new ServerInfo(
serverId, serverInfo.getAddress(), serverInfo.getPort(), true);
jointServers.put(serverId, markedServer);
}
return new Configuration(configVersion.incrementAndGet(), jointServers);
}
// Create the new configuration (confirm node addition)
private Configuration createNewConfig(Configuration jointConfig) {
    // Final configuration: clear all removal marks
Map<String, ServerInfo> newServers = new HashMap<>();
for (var entry : jointConfig.getServers().entrySet()) {
if (!entry.getValue().isMarkedForRemoval()) {
newServers.put(entry.getKey(), new ServerInfo(
entry.getValue().getId(),
entry.getValue().getAddress(),
entry.getValue().getPort(),
false
));
}
}
return new Configuration(configVersion.incrementAndGet(), newServers);
}
// Create the new configuration (confirm node removal)
private Configuration createNewConfigWithout(Configuration jointConfig, String serverId) {
Map<String, ServerInfo> newServers = new HashMap<>();
for (var entry : jointConfig.getServers().entrySet()) {
if (!entry.getKey().equals(serverId) && !entry.getValue().isMarkedForRemoval()) {
newServers.put(entry.getKey(), new ServerInfo(
entry.getValue().getId(),
entry.getValue().getAddress(),
entry.getValue().getPort(),
false
));
}
}
return new Configuration(configVersion.incrementAndGet(), newServers);
}
// Commit a configuration change
private CompletableFuture<Boolean> commitConfiguration(Configuration config) {
return CompletableFuture.supplyAsync(() -> {
try {
// A real implementation would commit the change through the consensus protocol
logger.info("Committing configuration version {}", config.getVersion());
// Persist the configuration
configStore.saveConfiguration(config);
// Update the local configuration (note: a monitor is used here while other
// paths use configLock; production code should settle on one lock)
synchronized (this) {
currentConfig = config;
servers.clear();
servers.putAll(config.getServers());
}
// Broadcast the configuration change
broadcastConfigChange(config);
return true;
} catch (Exception e) {
logger.error("Error committing configuration", e);
return false;
}
});
}
// Broadcast a configuration change
private void broadcastConfigChange(Configuration config) {
    // Send the change to every other node
for (String serverId : servers.keySet()) {
if (!serverId.equals(nodeId)) {
CompletableFuture.runAsync(() -> {
try {
// A real implementation would send the configuration via the network client
notifyConfigChange(serverId, config);
} catch (Exception e) {
logger.error("Failed to notify server {} of config change", serverId, e);
}
});
}
}
}
// Notify a node of a configuration change
private void notifyConfigChange(String serverId, Configuration config) {
    // A real implementation would send the configuration to the target node
logger.debug("Notifying server {} of configuration change to version {}",
serverId, config.getVersion());
}
// Handle a configuration change received from another node
public void handleConfigChange(Configuration newConfig) {
long stamp = configLock.writeLock();
try {
if (newConfig.getVersion() > currentConfig.getVersion()) {
try {
// Persist the new configuration
configStore.saveConfiguration(newConfig);
// Update the local configuration
currentConfig = newConfig;
servers.clear();
servers.putAll(newConfig.getServers());
logger.info("Updated to new configuration version {}", newConfig.getVersion());
} catch (IOException e) {
logger.error("Failed to persist new configuration", e);
}
} else {
logger.debug("Ignoring old configuration version {} (current is {})",
newConfig.getVersion(), currentConfig.getVersion());
}
} finally {
configLock.unlockWrite(stamp);
}
}
// Get the current configuration
public Configuration getCurrentConfig() {
long stamp = configLock.tryOptimisticRead();
Configuration config = currentConfig;
if (!configLock.validate(stamp)) {
stamp = configLock.readLock();
try {
config = currentConfig;
} finally {
configLock.unlockRead(stamp);
}
}
return config;
}
// Check whether a node is in the configuration (excluding nodes marked for removal)
public boolean isServerInConfig(String serverId) {
ServerInfo info = servers.get(serverId);
return info != null && !info.isMarkedForRemoval();
}
// Count active servers (excluding nodes marked for removal)
public int getActiveServerCount() {
return (int) servers.values().stream()
.filter(s -> !s.isMarkedForRemoval())
.count();
}
// Set leader status
public void setLeader(boolean isLeader) {
this.isLeader.set(isLeader);
}
@Override
public void close() {
// Release resources
}
// Configuration class
public static class Configuration implements Serializable {
private static final long serialVersionUID = 1L;
private final long version;
private final Map<String, ServerInfo> servers;
public Configuration(long version, Map<String, ServerInfo> servers) {
this.version = version;
this.servers = new HashMap<>(servers);
}
public long getVersion() {
return version;
}
public Map<String, ServerInfo> getServers() {
return Collections.unmodifiableMap(servers);
}
}
// Server information
public static class ServerInfo implements Serializable {
private static final long serialVersionUID = 1L;
private final String id;
private final String address;
private final int port;
private final boolean markedForRemoval;
public ServerInfo(String id, String address, int port) {
this(id, address, port, false);
}
public ServerInfo(String id, String address, int port, boolean markedForRemoval) {
this.id = id;
this.address = address;
this.port = port;
this.markedForRemoval = markedForRemoval;
}
public String getId() {
return id;
}
public String getAddress() {
return address;
}
public int getPort() {
return port;
}
public boolean isMarkedForRemoval() {
return markedForRemoval;
}
}
}
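To tie the pieces together, here is a minimal usage sketch. It is illustrative only: `FileBasedConfigurationStore` is defined in the next section, and `networkClient` stands for some implementation of the `NetworkClient` interface.
// Illustrative wiring; paths and addresses are made up.
ConfigurationStore store = new FileBasedConfigurationStore(
        Paths.get("/var/lib/consensus/config.bin"),
        Paths.get("/var/lib/consensus/snapshots"));
MembershipManager manager = new MembershipManager("node1", true, store, networkClient);
// Two-phase addition: the joint configuration commits first, then the final one.
manager.addServer("node4", "10.0.0.4", 8004)
       .thenAccept(ok -> System.out.println("node4 added: " + ok));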
Configuration store implementation
public class FileBasedConfigurationStore implements ConfigurationStore {
private final Path configPath;
private final Path snapshotDir;
private final Logger logger = LoggerFactory.getLogger(FileBasedConfigurationStore.class);
public FileBasedConfigurationStore(Path configPath, Path snapshotDir) {
this.configPath = configPath;
this.snapshotDir = snapshotDir;
try {
Files.createDirectories(configPath.getParent());
Files.createDirectories(snapshotDir);
} catch (IOException e) {
logger.error("Failed to create directories", e);
throw new UncheckedIOException("Failed to create directories", e);
}
}
@Override
public void saveConfiguration(MembershipManager.Configuration config) throws IOException {
// Use an atomic write to keep the file consistent
Path tempPath = configPath.resolveSibling(configPath.getFileName() + ".tmp");
try (ObjectOutputStream oos = new ObjectOutputStream(
new BufferedOutputStream(Files.newOutputStream(tempPath)))) {
oos.writeObject(config);
oos.flush();
Files.move(tempPath, configPath, StandardCopyOption.ATOMIC_MOVE,
StandardCopyOption.REPLACE_EXISTING);
logger.info("Configuration version {} saved successfully", config.getVersion());
} catch (IOException e) {
logger.error("Failed to save configuration", e);
throw e;
}
}
@Override
public MembershipManager.Configuration loadConfiguration() throws IOException {
if (!Files.exists(configPath)) {
logger.info("Configuration file does not exist: {}", configPath);
return null;
}
try (ObjectInputStream ois = new ObjectInputStream(
new BufferedInputStream(Files.newInputStream(configPath)))) {
MembershipManager.Configuration config =
(MembershipManager.Configuration) ois.readObject();
logger.info("Loaded configuration version {}", config.getVersion());
return config;
} catch (ClassNotFoundException e) {
logger.error("Failed to deserialize configuration", e);
throw new IOException("Failed to deserialize configuration", e);
}
}
@Override
public void saveSnapshot(long index, byte[] data) throws IOException {
// Snapshot file name embeds the zero-padded index
String snapshotFileName = String.format("snapshot-%020d.bin", index);
Path snapshotPath = snapshotDir.resolve(snapshotFileName);
Path tempPath = snapshotDir.resolve(snapshotFileName + ".tmp");
try {
// Write to a temporary file first
Files.write(tempPath, data);
// Atomically move into place
Files.move(tempPath, snapshotPath, StandardCopyOption.ATOMIC_MOVE,
StandardCopyOption.REPLACE_EXISTING);
logger.info("Snapshot at index {} saved successfully, size: {} bytes",
index, data.length);
// Clean up old snapshots, keeping the 5 most recent
cleanupOldSnapshots(5);
} catch (IOException e) {
logger.error("Failed to save snapshot at index {}", index, e);
throw e;
}
}
@Override
public SnapshotInfo loadLatestSnapshot() throws IOException {
    // Find the newest snapshot file; zero-padded names sort lexicographically.
    // Files.list returns a stream backed by a directory handle, so close it.
    try (Stream<Path> files = Files.list(snapshotDir)) {
        Optional<Path> latestSnapshot = files
                .filter(p -> p.getFileName().toString().startsWith("snapshot-") &&
                        p.getFileName().toString().endsWith(".bin"))
                .max(Comparator.comparing(p -> p.getFileName().toString()));
        if (latestSnapshot.isPresent()) {
            Path snapshotPath = latestSnapshot.get();
            String fileName = snapshotPath.getFileName().toString();
            // Extract the index from the file name ("snapshot-" is 9 characters)
            long index = Long.parseLong(fileName.substring(9, 29));
            // Read the snapshot payload
            byte[] data = Files.readAllBytes(snapshotPath);
            logger.info("Loaded snapshot at index {}, size: {} bytes", index, data.length);
            return new SnapshotInfo(index, data);
        } else {
            logger.info("No snapshot found in directory: {}", snapshotDir);
            return null;
        }
    } catch (IOException e) {
        logger.error("Failed to load latest snapshot", e);
        throw e;
    }
}
// Clean up old snapshots, keeping only the newest n
private void cleanupOldSnapshots(int keepCount) throws IOException {
    // Close the directory stream when done
    try (Stream<Path> files = Files.list(snapshotDir)) {
        List<Path> snapshots = files
                .filter(p -> p.getFileName().toString().startsWith("snapshot-") &&
                        p.getFileName().toString().endsWith(".bin"))
                .sorted(Comparator.comparing(p -> p.getFileName().toString()))
                .collect(Collectors.toList());
        // Delete the oldest snapshots if we exceed the retention count
        if (snapshots.size() > keepCount) {
            int toDelete = snapshots.size() - keepCount;
            for (int i = 0; i < toDelete; i++) {
                Files.delete(snapshots.get(i));
                logger.info("Deleted old snapshot: {}", snapshots.get(i).getFileName());
            }
        }
    } catch (IOException e) {
        logger.error("Failed to cleanup old snapshots", e);
        throw e;
    }
}
    // SnapshotInfo is defined once on the ConfigurationStore interface below;
    // a second nested copy here would shadow it and break the @Override
    // return type of loadLatestSnapshot.
}
// Configuration store interface
public interface ConfigurationStore {
void saveConfiguration(MembershipManager.Configuration config) throws IOException;
MembershipManager.Configuration loadConfiguration() throws IOException;
void saveSnapshot(long index, byte[] data) throws IOException;
SnapshotInfo loadLatestSnapshot() throws IOException;
// Shared snapshot info type used by implementations
class SnapshotInfo {
private final long index;
private final byte[] data;
public SnapshotInfo(long index, byte[] data) {
this.index = index;
this.data = data.clone();
}
public long getIndex() {
return index;
}
public byte[] getData() {
return data.clone();
}
}
}
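A quick round trip through the store, as a hedged sketch; the paths are placeholders.
// Hypothetical round trip over the API above.
ConfigurationStore store = new FileBasedConfigurationStore(
        Paths.get("/tmp/consensus/config.bin"), Paths.get("/tmp/consensus/snapshots"));
Map<String, MembershipManager.ServerInfo> servers = new HashMap<>();
servers.put("node1", new MembershipManager.ServerInfo("node1", "10.0.0.1", 8001));
store.saveConfiguration(new MembershipManager.Configuration(1L, servers));
MembershipManager.Configuration loaded = store.loadConfiguration();
System.out.println("Loaded configuration version: " + loaded.getVersion());
// Snapshots are keyed by index; only the newest five are retained.
store.saveSnapshot(42L, "state-bytes".getBytes(StandardCharsets.UTF_8));
ConfigurationStore.SnapshotInfo latest = store.loadLatestSnapshot();
System.out.println("Latest snapshot index: " + latest.getIndex());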
Cross-datacenter replication support
public class CrossDCReplication implements AutoCloseable {
private final String localDC;
private final List<String> allDCs;
private final Map<String, DCConnection> dcConnections;
private final ConsensusSystem localSystem;
private final Logger logger = LoggerFactory.getLogger(CrossDCReplication.class);
private final ScheduledExecutorService scheduler;
private final AtomicLong replicationIndex = new AtomicLong(0);
private final ConcurrentMap<String, AtomicLong> dcReplicationProgress = new ConcurrentHashMap<>();
public CrossDCReplication(String localDC, List<String> allDCs,
ConsensusSystem localSystem,
Map<String, DCConnectionConfig> dcConfigs) {
this.localDC = localDC;
this.allDCs = new ArrayList<>(allDCs);
this.localSystem = localSystem;
this.dcConnections = new HashMap<>();
// Initialize connections to the other data centers
for (String dc : allDCs) {
if (!dc.equals(localDC)) {
DCConnectionConfig config = dcConfigs.get(dc);
if (config != null) {
dcConnections.put(dc, new DCConnection(dc, config));
dcReplicationProgress.put(dc, new AtomicLong(0));
}
}
}
this.scheduler = Executors.newScheduledThreadPool(2, r -> {
Thread t = new Thread(r, "dc-replication-scheduler");
t.setDaemon(true);
return t;
});
// Start the periodic replication task
scheduler.scheduleWithFixedDelay(
this::replicateChanges,
1000, 1000, TimeUnit.MILLISECONDS
);
// Start the periodic health-check task
scheduler.scheduleWithFixedDelay(
this::checkDCHealth,
5000, 5000, TimeUnit.MILLISECONDS
);
}
// Replicate a request to the other data centers
public CompletableFuture<Boolean> replicateRequest(Request request) {
MDC.put("component", "cross-dc-replication");
MDC.put("requestId", request.getId());
try {
// 1. Process the request in the local DC first
return localSystem.processWrite(request)
.thenCompose(localSuccess -> {
if (!localSuccess) {
logger.warn("Request {} failed in local DC", request.getId());
return CompletableFuture.completedFuture(false);
}
// 2. On local success, advance the replication index
long index = replicationIndex.incrementAndGet();
// 3. Replicate asynchronously to the other data centers
List<CompletableFuture<Boolean>> dcFutures = new ArrayList<>();
for (var entry : dcConnections.entrySet()) {
String dc = entry.getKey();
DCConnection connection = entry.getValue();
dcFutures.add(connection.replicateRequest(request, index)
.thenApply(success -> {
if (success) {
// Advance this DC's replication progress
dcReplicationProgress.get(dc).updateAndGet(
current -> Math.max(current, index));
logger.info("Request {} successfully replicated to DC {}",
request.getId(), dc);
} else {
logger.warn("Failed to replicate request {} to DC {}",
request.getId(), dc);
}
return success;
})
.exceptionally(e -> {
logger.error("Error replicating request {} to DC {}",
request.getId(), dc, e);
return false;
}));
}
// 4. Wait for DC responses according to the configured replication strategy
return handleDCReplications(dcFutures);
});
} finally {
MDC.remove("component");
MDC.remove("requestId");
}
}
// Resolve cross-DC replication results according to the replication strategy.
// The original default branch duplicated the QUORUM logic; it now falls through.
private CompletableFuture<Boolean> handleDCReplications(
        List<CompletableFuture<Boolean>> dcFutures) {
    ReplicationStrategy strategy = ReplicationStrategy.QUORUM; // could be made configurable
    switch (strategy) {
        case ALL:
            // Every DC must succeed
            return CompletableFuture.allOf(
                    dcFutures.toArray(new CompletableFuture[0]))
                .thenApply(v -> dcFutures.stream()
                    .allMatch(f -> {
                        try {
                            return f.get();
                        } catch (Exception e) {
                            return false;
                        }
                    }));
        case ANY:
            // At least one DC must succeed
            return CompletableFuture.supplyAsync(() -> {
                for (CompletableFuture<Boolean> future : dcFutures) {
                    try {
                        if (future.get(5, TimeUnit.SECONDS)) {
                            return true;
                        }
                    } catch (Exception e) {
                        logger.warn("Error waiting for DC replication", e);
                    }
                }
                return false;
            });
        case ASYNC:
            // Fire-and-forget: do not wait for results
            return CompletableFuture.completedFuture(true);
        case QUORUM:
        default:
            if (strategy != ReplicationStrategy.QUORUM) {
                logger.warn("Unknown replication strategy: {}, using QUORUM", strategy);
            }
            // A majority of DCs must succeed
            return CompletableFuture.supplyAsync(() -> {
                int successCount = 0;
                int requiredSuccesses = (dcFutures.size() / 2) + 1;
                for (CompletableFuture<Boolean> future : dcFutures) {
                    try {
                        if (future.get(5, TimeUnit.SECONDS)) {
                            successCount++;
                            if (successCount >= requiredSuccesses) {
                                return true;
                            }
                        }
                    } catch (Exception e) {
                        logger.warn("Error waiting for DC replication", e);
                    }
                }
                return successCount >= requiredSuccesses;
            });
    }
}
// Periodically replicate outstanding changes to the other data centers
private void replicateChanges() {
try {
// Snapshot the current replication progress
Map<String, Long> progress = new HashMap<>();
for (var entry : dcReplicationProgress.entrySet()) {
progress.put(entry.getKey(), entry.getValue().get());
}
// For each DC, replicate the changes it has not yet received
for (var entry : dcConnections.entrySet()) {
String dc = entry.getKey();
DCConnection connection = entry.getValue();
long currentProgress = progress.get(dc);
if (currentProgress < replicationIndex.get()) {
// Collect the changes that still need replication
List<ReplicationEntry> changes =
getChangesSince(currentProgress, replicationIndex.get());
if (!changes.isEmpty()) {
connection.replicateChanges(changes)
.thenAccept(lastIndex -> {
if (lastIndex > currentProgress) {
// Advance the replication progress
dcReplicationProgress.get(dc).updateAndGet(
current -> Math.max(current, lastIndex));
logger.info("Replicated changes to DC {} up to index {}",
dc, lastIndex);
}
})
.exceptionally(e -> {
logger.error("Failed to replicate changes to DC {}", dc, e);
return null;
});
}
}
}
} catch (Exception e) {
logger.error("Error in replication task", e);
}
}
// Check the health of each data center
private void checkDCHealth() {
for (var entry : dcConnections.entrySet()) {
String dc = entry.getKey();
DCConnection connection = entry.getValue();
connection.checkHealth()
.thenAccept(healthy -> {
if (healthy) {
if (connection.markHealthy()) {
logger.info("DC {} is now healthy", dc);
}
} else {
if (connection.markUnhealthy()) {
logger.warn("DC {} is now unhealthy", dc);
}
}
})
.exceptionally(e -> {
logger.error("Error checking health of DC {}", dc, e);
connection.markUnhealthy();
return null;
});
}
}
// Fetch the changes in the given index range
private List<ReplicationEntry> getChangesSince(long fromIndex, long toIndex) {
    // A real implementation would read the changes from the log store
    List<ReplicationEntry> changes = new ArrayList<>();
    // Simplified example
    for (long i = fromIndex + 1; i <= toIndex; i++) {
        // Simulate fetching a change
changes.add(new ReplicationEntry(i, null));
}
return changes;
}
@Override
public void close() {
// Shut down the scheduler
scheduler.shutdownNow();
try {
if (!scheduler.awaitTermination(5, TimeUnit.SECONDS)) {
logger.warn("Scheduler did not terminate in time");
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
logger.warn("Interrupted while waiting for scheduler termination");
}
// Close all DC connections
for (DCConnection connection : dcConnections.values()) {
connection.close();
}
}
// Data center connection class
private class DCConnection implements AutoCloseable {
private final String dcId;
private final DCConnectionConfig config;
private final AtomicBoolean healthy = new AtomicBoolean(true);
private final NetworkClient networkClient;
public DCConnection(String dcId, DCConnectionConfig config) {
this.dcId = dcId;
this.config = config;
this.networkClient = createNetworkClient();
}
private NetworkClient createNetworkClient() {
// Create the network client used for cross-DC communication
// Simplified example
return null;
}
public CompletableFuture<Boolean> replicateRequest(Request request, long index) {
if (!healthy.get()) {
return CompletableFuture.completedFuture(false);
}
// A real implementation would send the request to the target DC
return CompletableFuture.completedFuture(true);
}
public CompletableFuture<Long> replicateChanges(List<ReplicationEntry> changes) {
if (!healthy.get() || changes.isEmpty()) {
return CompletableFuture.completedFuture(0L);
}
// A real implementation would send the changes to the target DC in a batch
long lastIndex = changes.get(changes.size() - 1).getIndex();
return CompletableFuture.completedFuture(lastIndex);
}
public CompletableFuture<Boolean> checkHealth() {
// A real implementation would perform an actual health check
return CompletableFuture.completedFuture(true);
}
public boolean markHealthy() {
return healthy.compareAndSet(false, true);
}
public boolean markUnhealthy() {
return healthy.compareAndSet(true, false);
}
@Override
public void close() {
// Close the network client
}
}
// Replication entry
private static class ReplicationEntry {
private final long index;
private final byte[] data;
public ReplicationEntry(long index, byte[] data) {
this.index = index;
this.data = data != null ? data.clone() : null;
}
public long getIndex() {
return index;
}
public byte[] getData() {
return data != null ? data.clone() : null;
}
}
// Data center connection configuration
public static class DCConnectionConfig {
private final String primaryEndpoint;
private final List<String> backupEndpoints;
private final int connectTimeoutMs;
private final int readTimeoutMs;
public DCConnectionConfig(String primaryEndpoint, List<String> backupEndpoints,
int connectTimeoutMs, int readTimeoutMs) {
this.primaryEndpoint = primaryEndpoint;
this.backupEndpoints = new ArrayList<>(backupEndpoints);
this.connectTimeoutMs = connectTimeoutMs;
this.readTimeoutMs = readTimeoutMs;
}
// Getters...
}
// Replication strategies
public enum ReplicationStrategy {
    ALL,    // every DC must succeed
    QUORUM, // a majority of DCs must succeed
    ANY,    // at least one DC must succeed
    ASYNC   // asynchronous; do not wait for results
}
}
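As a hedged usage sketch, the wiring below shows how the pieces fit; `localSystem` stands for a `ConsensusSystem` implementation and the endpoints are invented.
// Illustrative only; endpoints and the ConsensusSystem instance are assumptions.
Map<String, CrossDCReplication.DCConnectionConfig> dcConfigs = new HashMap<>();
dcConfigs.put("dc-west", new CrossDCReplication.DCConnectionConfig(
        "west.example.com:9000", List.of("west-backup.example.com:9000"), 3000, 5000));
CrossDCReplication replication = new CrossDCReplication(
        "dc-east", List.of("dc-east", "dc-west"), localSystem, dcConfigs);
replication.replicateRequest(new Request("req-1", "payload".getBytes()))
           .thenAccept(ok -> System.out.println("Cross-DC commit acknowledged: " + ok));
// On shutdown: replication.close();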
五、How ZAB and Paxos Relate and Differ
Similarities
What the two share:
- Majority (quorum) mechanism: both require acknowledgement from more than half of the nodes to guarantee safety and prevent split brain
- Phased operation: both reach agreement through two main phases, a prepare phase and a commit/accept phase
- Safety guarantee: neither ever enters an inconsistent state, under any circumstances
- Fault tolerance: both keep working while a minority of nodes has failed
- Handling of network partitions: both preserve safety under a partition, preferring to stop serving rather than break consistency
Differences
Key differences:
- Design goal:
  - ZAB: a state-machine replication protocol built specifically for ZooKeeper, emphasizing whole-system replication and ordering
  - Paxos: a general-purpose distributed consensus algorithm, focused on deciding a single value
- Leader relationship:
  - ZAB: an explicit Leader-Follower architecture with centralized processing
  - Basic Paxos: symmetric roles in the original design, with no fixed Leader
  - Multi-Paxos: introduces a Leader optimization while staying role-symmetric in theory
- Message ordering:
  - ZAB: guarantees strict FIFO processing, using the ZXID (epoch + counter) for global order
  - Basic Paxos: no ordering guarantee; it only decides a single value
  - Multi-Paxos: can order operations via instance IDs, but needs extra machinery
- Recovery mechanism:
  - ZAB: a dedicated crash-recovery mode with election, discovery, synchronization, and activation phases
  - Paxos: recovers through the regular algorithm flow, with no special recovery mode
- Transaction identification:
  - ZAB: uses the ZXID (epoch + counter) as a globally unique identifier
  - Paxos: uses the proposal (ballot) number and the instance ID to identify proposals and log positions respectively
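Because the ZXID carries so much weight in ZAB, it is worth making its structure concrete: ZooKeeper packs a 32-bit epoch into the high half of the 64-bit ZXID and a 32-bit counter into the low half, which is why ZXIDs remain globally ordered across leader changes. A small sketch:
// The ZXID layout: high 32 bits = epoch, low 32 bits = counter.
public final class Zxid {
    public static long make(int epoch, int counter) {
        return ((long) epoch << 32) | (counter & 0xFFFFFFFFL);
    }
    public static int epoch(long zxid)   { return (int) (zxid >>> 32); }
    public static int counter(long zxid) { return (int) (zxid & 0xFFFFFFFFL); }

    public static void main(String[] args) {
        long zxid = make(3, 17);
        // Prints "300000011 -> epoch=3, counter=17"
        System.out.println(Long.toHexString(zxid) + " -> epoch=" + epoch(zxid)
                + ", counter=" + counter(zxid));
    }
}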
六、Performance Comparison and Engineering Practice
Performance comparison

| Metric | ZAB protocol | Basic Paxos | Multi-Paxos |
| --- | --- | --- | --- |
| Write latency | 2 RTT (normal mode) | 2 RTT | 1 RTT (with a stable Leader) |
| Read latency | 0 RTT (local read) - 1 RTT (consistent read) | 2 RTT | 0 RTT (local read) - 1 RTT (consistent read) |
| Write throughput | High (batching optimizations) | Low | Medium-high (batching optimizations) |
| Read throughput | Very high (local reads) | Low | High (local reads) |
| Message complexity | O(n) | O(n²) | O(n) (stable Leader) |
| CPU usage | Medium | High | Medium-high |
| Memory footprint | Medium | Medium | Medium |
| Recovery time | Short (dedicated recovery mechanism) | Long | Medium |
Horizontal scalability
How performance changes as the cluster grows:

| Cluster size | ZAB protocol | Paxos algorithm |
| --- | --- | --- |
| 3 nodes | High throughput, low latency | Medium throughput, medium latency |
| 5 nodes | Good throughput, slightly higher latency | Throughput drops, latency rises |
| 7 nodes | Throughput drops, latency rises | Significant throughput drop, high latency |
| 9+ nodes | Not recommended (noticeable performance degradation) | Not recommended (noticeable performance degradation) |
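The drop-off at larger cluster sizes follows directly from quorum arithmetic: every write needs a majority of acknowledgements, so each additional pair of nodes enlarges the quorum while buying only one extra tolerated failure. A minimal illustration:
// Quorum arithmetic behind the scalability table above.
public class QuorumMath {
    static int quorumSize(int n)        { return n / 2 + 1; }    // acks needed per write
    static int toleratedFailures(int n) { return (n - 1) / 2; }  // survivable crash faults

    public static void main(String[] args) {
        for (int n : new int[]{3, 5, 7, 9}) {
            System.out.printf("n=%d: quorum=%d, tolerates %d failure(s)%n",
                    n, quorumSize(n), toleratedFailures(n));
        }
    }
}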
JVM tuning recommendations
/**
 * Recommended JVM options:
 * -Xms4g -Xmx4g                          // fix the heap size to avoid resizing
 * -XX:+UseG1GC                           // use the G1 garbage collector
 * -XX:MaxGCPauseMillis=200               // target maximum GC pause
 * -XX:InitiatingHeapOccupancyPercent=45  // occupancy threshold that starts a GC cycle
 * -XX:+AlwaysPreTouch                    // pre-touch memory pages at startup
 * -XX:+DisableExplicitGC                 // ignore explicit System.gc() calls
 * -XX:+HeapDumpOnOutOfMemoryError        // dump the heap on OOM
 * -XX:HeapDumpPath=/path/to/dumps        // heap dump location
 * -XX:+UseCompressedOops                 // compressed ordinary object pointers
 * -XX:+UseCompressedClassPointers        // compressed class pointers
 * -Djava.net.preferIPv4Stack=true        // prefer the IPv4 stack
 */
Selection guidance
Best practices from engineering experience:
- Set timeout parameters sensibly:
  - Timeouts that are too short trigger unnecessary elections
  - Timeouts that are too long slow down failure recovery
  - Tune them dynamically to the network environment
- Batch requests (see the sketch after this list):
  - Merge multiple write requests into a single batch
  - Cut down the number of network round trips
  - Raise overall throughput
- Separate reads from writes:
  - Route write requests through the Leader
  - Serve reads locally where the consistency requirement allows
  - Use a read cache to reduce disk I/O
- Monitor the key metrics:
  - Commit latency
  - Leader switch frequency
  - Request queue depth
  - Network latency and bandwidth usage
- Practice preventive maintenance:
  - Compact logs regularly
  - Take snapshots
  - Watch disk space
  - Run failure drills to exercise the recovery path
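For the batching recommendation above, here is a minimal sketch of a write batcher. It is an illustration only: the flush interval, batch size, and the `sink` function (e.g. something like the `processBatchWrite` used in the tests below) are assumptions.
// Minimal write-batcher sketch; thresholds and the sink are illustrative assumptions.
public class WriteBatcher implements AutoCloseable {
    private final List<Request> pending = new ArrayList<>();
    private final int maxBatchSize;
    private final Function<List<Request>, CompletableFuture<Map<String, Boolean>>> sink;
    private final ScheduledExecutorService flusher =
            Executors.newSingleThreadScheduledExecutor();

    public WriteBatcher(int maxBatchSize,
            Function<List<Request>, CompletableFuture<Map<String, Boolean>>> sink) {
        this.maxBatchSize = maxBatchSize;
        this.sink = sink;
        // A time-based flush bounds the latency of half-full batches
        flusher.scheduleWithFixedDelay(this::flush, 10, 10, TimeUnit.MILLISECONDS);
    }

    public synchronized void submit(Request request) {
        pending.add(request);
        if (pending.size() >= maxBatchSize) {
            flush(); // a size-based flush bounds memory use
        }
    }

    private synchronized void flush() {
        if (pending.isEmpty()) return;
        List<Request> batch = new ArrayList<>(pending);
        pending.clear();
        sink.apply(batch); // e.g. zabBroadcast::processBatchWrite
    }

    @Override
    public void close() {
        flusher.shutdownNow();
        flush();
    }
}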
七、Unit Test Examples
@RunWith(MockitoJUnitRunner.class)
public class ZABBroadcastTest {
private ZABBroadcast zabBroadcast;
private AtomicLong zxid;
private AtomicInteger epoch;
@Mock
private NetworkClient mockNetworkClient;
@Mock
private StateMachine mockStateMachine;
@Before
public void setUp() {
zxid = new AtomicLong(0);
epoch = new AtomicInteger(0);
zabBroadcast = new ZABBroadcast("server1", zxid, epoch, mockNetworkClient, mockStateMachine);
// Add followers
ServerData follower1 = new ServerData("server2", "localhost", 8001);
ServerData follower2 = new ServerData("server3", "localhost", 8002);
zabBroadcast.addFollower(follower1);
zabBroadcast.addFollower(follower2);
}
@After
public void tearDown() {
zabBroadcast.close();
}
@Test
public void testProcessWriteSuccess() throws Exception {
    // Stub the mock behavior
    ACK successAck = new ACK(true, zxid.get() + 1);
    when(mockNetworkClient.sendProposal(anyString(), any(ProposalPacket.class)))
        .thenReturn(successAck);
    // Run the test
    Request request = new Request("req1", "test data".getBytes());
    CompletableFuture<Boolean> result = zabBroadcast.processWrite(request);
    // Verify the result
    assertTrue(result.get(1, TimeUnit.SECONDS));
    // Verify the interactions
    verify(mockNetworkClient, times(2)).sendProposal(anyString(), any(ProposalPacket.class));
    verify(mockNetworkClient, times(2)).sendCommit(anyString(), any(CommitPacket.class));
}
@Test
public void testProcessWriteFailure() throws Exception {
    // Stub the mocks - one follower succeeds, the other fails
    when(mockNetworkClient.sendProposal(eq("server2"), any(ProposalPacket.class)))
        .thenReturn(new ACK(true, zxid.get() + 1));
    when(mockNetworkClient.sendProposal(eq("server3"), any(ProposalPacket.class)))
        .thenReturn(new ACK(false, zxid.get()));
    // Run the test
    Request request = new Request("req1", "test data".getBytes());
    CompletableFuture<Boolean> result = zabBroadcast.processWrite(request);
    // Verify the result - must fail because there is no majority of ACKs
    assertFalse(result.get(1, TimeUnit.SECONDS));
    // Verify the interactions - no commit should be sent
    verify(mockNetworkClient, times(2)).sendProposal(anyString(), any(ProposalPacket.class));
    verify(mockNetworkClient, never()).sendCommit(anyString(), any(CommitPacket.class));
}
@Test
public void testBatchWritePerformance() throws Exception {
    // Stub the mock behavior
    ACK successAck = new ACK(true, zxid.get() + 1);
    when(mockNetworkClient.sendProposal(anyString(), any(ProposalPacket.class)))
        .thenReturn(successAck);
    // Prepare the batched requests
    List<Request> requests = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
        requests.add(new Request("req" + i, ("data" + i).getBytes()));
    }
    // Run the test
    Stopwatch stopwatch = Stopwatch.createStarted();
    CompletableFuture<Map<String, Boolean>> result = zabBroadcast.processBatchWrite(requests);
    Map<String, Boolean> results = result.get(5, TimeUnit.SECONDS);
    stopwatch.stop();
    // Verify the results
    assertEquals(100, results.size());
    assertTrue(results.values().stream().allMatch(v -> v));
    // Print the performance figures
    System.out.println("Batch write of 100 requests took " +
        stopwatch.elapsed(TimeUnit.MILLISECONDS) + "ms");
    // Verify the interactions - the whole batch should take a single round trip
    verify(mockNetworkClient, times(2)).sendProposal(anyString(), any(ProposalPacket.class));
    verify(mockNetworkClient, times(2)).sendCommit(anyString(), any(CommitPacket.class));
}
@Test
public void testCircuitBreakerTrip() throws Exception {
    // Stub the mock behavior - always fail
    when(mockNetworkClient.sendProposal(anyString(), any(ProposalPacket.class)))
        .thenReturn(new ACK(false, zxid.get()));
    // Issue several failing requests to trip the circuit breaker
    Request request = new Request("req1", "test data".getBytes());
    for (int i = 0; i < 5; i++) {
        try {
            CompletableFuture<Boolean> result = zabBroadcast.processWrite(request);
            result.get(1, TimeUnit.SECONDS);
        } catch (Exception e) {
            // Ignore the expected failures
        }
    }
    // The 6th request should be rejected outright by the open breaker
    try {
        CompletableFuture<Boolean> result = zabBroadcast.processWrite(request);
        result.get(1, TimeUnit.SECONDS);
        fail("Should have thrown CircuitBreakerOpenException");
    } catch (ExecutionException e) {
        assertTrue(e.getCause() instanceof ProcessingException);
        assertTrue(e.getCause().getCause() instanceof ZABBroadcast.CircuitBreakerOpenException);
    }
}
@Test
public void testReadWithConsistencyLevels() throws Exception {
    // Exercise reads at different consistency levels.
    // sendHeartbeat returns void, so it must be stubbed with doNothing()
    doNothing().when(mockNetworkClient).sendHeartbeat(anyString(), anyLong());
    // Linearizable read
    CompletableFuture<Result> linearResult =
        zabBroadcast.readWithConsistency("testKey", ConsistencyLevel.LINEARIZABLE);
    // Sequentially consistent read
    CompletableFuture<Result> sequentialResult =
        zabBroadcast.readWithConsistency("testKey", ConsistencyLevel.SEQUENTIAL);
    // Eventually consistent read
    CompletableFuture<Result> eventualResult =
        zabBroadcast.readWithConsistency("testKey", ConsistencyLevel.EVENTUAL);
    // All three reads should complete successfully
    assertNotNull(linearResult.get(1, TimeUnit.SECONDS));
    assertNotNull(sequentialResult.get(1, TimeUnit.SECONDS));
    assertNotNull(eventualResult.get(1, TimeUnit.SECONDS));
}
}
八、Client API Examples
public class DistributedSystemClient implements AutoCloseable {
private final ZabClient zabClient;
private final PaxosClient paxosClient;
private final Logger logger = LoggerFactory.getLogger(DistributedSystemClient.class);
public DistributedSystemClient(String zkConnectString, String paxosConnectString) {
this.zabClient = new ZabClient(zkConnectString);
this.paxosClient = new PaxosClient(paxosConnectString);
}
// ZAB client example - configuration service
public class ZabClient implements AutoCloseable {
private final String connectString;
private final CuratorFramework client;
public ZabClient(String connectString) {
this.connectString = connectString;
this.client = CuratorFramework.builder()
.connectString(connectString)
.retryPolicy(new ExponentialBackoffRetry(1000, 3))
.build();
this.client.start();
}
// Store a configuration value
public void storeConfig(String path, String data) throws Exception {
    try {
        // Create the node if the path does not exist yet
if (client.checkExists().forPath(path) == null) {
client.create()
.creatingParentsIfNeeded()
.withMode(CreateMode.PERSISTENT)
.forPath(path, data.getBytes(StandardCharsets.UTF_8));
logger.info("Created config at path: {}", path);
} else {
client.setData()
.forPath(path, data.getBytes(StandardCharsets.UTF_8));
logger.info("Updated config at path: {}", path);
}
} catch (Exception e) {
logger.error("Failed to store config at path: {}", path, e);
throw e;
}
}
// Read a configuration value
public String getConfig(String path) throws Exception {
try {
byte[] data = client.getData().forPath(path);
return new String(data, StandardCharsets.UTF_8);
} catch (Exception e) {
logger.error("Failed to read config from path: {}", path, e);
throw e;
}
}
// Watch a configuration path for changes
public void watchConfig(String path, Consumer<String> changeCallback) throws Exception {
    try {
        // Register the watcher
client.getData().usingWatcher(new CuratorWatcher() {
@Override
public void process(WatchedEvent event) throws Exception {
if (event.getType() == EventType.NodeDataChanged) {
String newData = getConfig(path);
changeCallback.accept(newData);
// ZooKeeper watches fire once, so re-register
watchConfig(path, changeCallback);
}
}
}).forPath(path);
logger.info("Set watch on path: {}", path);
} catch (Exception e) {
logger.error("Failed to set watch on path: {}", path, e);
throw e;
}
}
// Distributed lock
public DistributedLock getLock(String lockPath) {
return new DistributedLock(client, lockPath);
}
@Override
public void close() {
client.close();
}
// Distributed lock implementation
public class DistributedLock {
private final InterProcessMutex mutex;
private final String lockPath;
public DistributedLock(CuratorFramework client, String lockPath) {
this.lockPath = lockPath;
this.mutex = new InterProcessMutex(client, lockPath);
}
public void lock(long timeout, TimeUnit unit) throws Exception {
if (mutex.acquire(timeout, unit)) {
logger.info("Acquired lock: {}", lockPath);
} else {
logger.warn("Failed to acquire lock: {} within timeout", lockPath);
throw new TimeoutException("Failed to acquire lock: " + lockPath);
}
}
public void unlock() {
try {
mutex.release();
logger.info("Released lock: {}", lockPath);
} catch (Exception e) {
logger.error("Error releasing lock: {}", lockPath, e);
}
}
}
}
// Paxos client example - distributed KV store
public class PaxosClient implements AutoCloseable {
private final String connectString;
private final PaxosKVStore kvStore;
public PaxosClient(String connectString) {
this.connectString = connectString;
this.kvStore = new PaxosKVStore(connectString);
}
// Write a key-value pair
public CompletableFuture<Boolean> put(String key, String value,
ConsistencyLevel consistencyLevel) {
return kvStore.put(key, value, consistencyLevel);
}
// Read the value for a key
public CompletableFuture<String> get(String key, ConsistencyLevel consistencyLevel) {
return kvStore.get(key, consistencyLevel);
}
// Delete a key
public CompletableFuture<Boolean> delete(String key) {
return kvStore.delete(key);
}
@Override
public void close() {
kvStore.close();
}
// Paxos KV store implementation
private class PaxosKVStore implements AutoCloseable {
private final PaxosClient client;
public PaxosKVStore(String connectString) {
    // A real implementation would connect to the Paxos cluster
    this.client = null; // simplified example
}
public CompletableFuture<Boolean> put(String key, String value,
ConsistencyLevel consistencyLevel) {
// A real implementation would submit the write through the Paxos protocol
logger.info("Putting key: {} with consistency: {}", key, consistencyLevel);
return CompletableFuture.completedFuture(true);
}
public CompletableFuture<String> get(String key, ConsistencyLevel consistencyLevel) {
// A real implementation would choose a read strategy based on the consistency level
logger.info("Getting key: {} with consistency: {}", key, consistencyLevel);
return CompletableFuture.completedFuture("value");
}
public CompletableFuture<Boolean> delete(String key) {
// A delete is also a write, submitted through the Paxos protocol
logger.info("Deleting key: {}", key);
return CompletableFuture.completedFuture(true);
}
@Override
public void close() {
// Release resources
}
}
}
// Usage example
public void runExample() throws Exception {
// ZAB client usage example
try (ZabClient zab = new ZabClient("localhost:2181")) {
// Store a config value
zab.storeConfig("/app/config", "{\"timeout\": 30, \"maxRetries\": 3}");
// Read it back
String config = zab.getConfig("/app/config");
System.out.println("Config: " + config);
// Watch for config changes
zab.watchConfig("/app/config", newConfig -> {
System.out.println("Config changed: " + newConfig);
});
// Use the distributed lock
ZabClient.DistributedLock lock = zab.getLock("/app/locks/resource1");
try {
lock.lock(10, TimeUnit.SECONDS);
// Critical-section work
System.out.println("Performing critical operation...");
Thread.sleep(1000);
} finally {
lock.unlock();
}
}
// Paxos client usage example
try (PaxosClient paxos = new PaxosClient("localhost:8000,localhost:8001,localhost:8002")) {
// Write data
paxos.put("user:1001", "{\"name\":\"John\",\"email\":\"john@example.com\"}",
ConsistencyLevel.LINEARIZABLE)
.thenAccept(success -> {
System.out.println("Write success: " + success);
})
.join();
// Read data
paxos.get("user:1001", ConsistencyLevel.SEQUENTIAL)
.thenAccept(value -> {
System.out.println("User data: " + value);
})
.join();
// Delete data
paxos.delete("user:1001")
.thenAccept(success -> {
System.out.println("Delete success: " + success);
})
.join();
}
}
@Override
public void close() throws Exception {
zabClient.close();
paxosClient.close();
}
}
九、Summary

| Aspect | ZAB protocol | Paxos algorithm |
| --- | --- | --- |
| Design goal | State-machine replication | Distributed consensus |
| Leader relationship | Explicit Leader-Follower | Basic Paxos has no fixed roles; Multi-Paxos adds a Leader optimization |
| Message ordering | Strict FIFO order | Basic Paxos gives no ordering guarantee; Multi-Paxos can provide one |
| Recovery mechanism | Dedicated recovery mode | Recovery through the regular algorithm flow |
| Implementation complexity | Medium | High |
| Best fit | Systems that need strong consistency and ordering guarantees | General distributed systems, especially single-value decisions |
| Typical applications | ZooKeeper | Chubby, etcd (Raft variant) |
| Performance profile | Writes go through the Leader; read performance is high | The basic form is slow; optimized variants can perform well |
| Extensibility | Constrained by the ZooKeeper architecture | The underlying theory is easier to extend and adapt |

ZAB and Paxos are both excellent distributed consistency algorithms and occupy a central place in modern distributed system design. Understanding how they work, how they are implemented, and where each fits is essential to building reliable distributed systems.
Whichever algorithm you choose, weigh it against your concrete use case, consistency requirements, and performance targets. With the engineering practices and optimization techniques shown in this article, developers can build high-performance, highly reliable distributed systems that meet the demands of complex business scenarios.