Architecture for High Availability
Introduction
In the digital era, a system's availability bears directly on a company's survival and growth. A highly available system keeps serving users even while weathering failures and disruptions of many kinds. Availability is one of the most important quality metrics of a distributed system, and designing for it is a core architectural concern.
The high-availability principle says: eliminate single points of failure, build in redundancy, and recover from faults automatically, so that the system keeps delivering service continuously and stably under all kinds of abnormal conditions. This is a requirement on the technical architecture, and equally a guarantee of business continuity.
Core Concepts of High-Availability Architecture
What Is High Availability?
High availability (HA) is a system's ability to keep providing service in the face of failures and other abnormal conditions. The industry usually measures availability in "nines":
- 99% (two nines): up to about 3.65 days of downtime per year
- 99.9% (three nines): up to about 8.76 hours per year
- 99.99% (four nines): up to about 52.6 minutes per year
- 99.999% (five nines): up to about 5.26 minutes per year
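Each additional nine shrinks the downtime budget by an order of magnitude, so it is worth internalizing the arithmetic. The small sketch below (plain Java, no framework assumptions) converts an availability target into an annual downtime budget.
// Turn an availability target into a downtime budget; plain arithmetic only
public final class AvailabilityBudget {

    private static final double MINUTES_PER_YEAR = 365 * 24 * 60;

    // Allowed downtime in minutes per year for a given availability, e.g. 0.999
    public static double allowedDowntimeMinutesPerYear(double availability) {
        return MINUTES_PER_YEAR * (1 - availability);
    }

    public static void main(String[] args) {
        System.out.printf("99.9%%  -> %.1f min/year%n", allowedDowntimeMinutesPerYear(0.999));   // ~525.6
        System.out.printf("99.99%% -> %.1f min/year%n", allowedDowntimeMinutesPerYear(0.9999));  // ~52.6
    }
}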
Classifying Unavailability
Where there is an availability ratio, some unavailability is inevitable. Outages are usually classified as planned or unplanned:
- Planned: scheduled maintenance, version upgrades, configuration changes, data migrations and backups
- Unplanned: hardware failures, software defects, network outages, operator error, security incidents, natural disasters
Goals of High-Availability Architecture
The core goal of high-availability architecture is to eliminate single points of failure (Single Point of Failure, SPOF), including:
- Single points inside an IDC: an individual server, network device, or storage device
- Single points across IDCs: the failure of an entire data center or an entire region
High-Availability Design Principles
1. The Redundancy Principle
Build multiple layers of backup so that no single failure can take the system down.
Hardware Redundancy Implementation
// Active/standby (hot standby) data source configuration
// (service-level load balancing is wired up separately; see the
// Spring Cloud LoadBalancer example later in this article)
@Configuration
public class HighAvailabilityConfig {

    // Primary data source; @Primary makes it the default injection target
    @Bean
    @Primary
    public DataSource primaryDataSource() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl("jdbc:mysql://primary-db:3306/myapp");
        config.setUsername("root");
        config.setPassword("password");
        config.setMaximumPoolSize(20);
        config.setMinimumIdle(5);
        return new HikariDataSource(config);
    }

    // Standby data source pointing at the hot-standby database
    @Bean
    public DataSource standbyDataSource() {
        HikariConfig config = new HikariConfig();
        config.setJdbcUrl("jdbc:mysql://standby-db:3306/myapp");
        config.setUsername("root");
        config.setPassword("password");
        config.setMaximumPoolSize(20);
        config.setMinimumIdle(5);
        return new HikariDataSource(config);
    }
}
// Connection-pool failover: try each data source in turn until one yields a connection
@Component
public class FailoverDataSource implements DataSource {

    private final List<DataSource> dataSources;
    private final AtomicInteger currentIndex = new AtomicInteger(0);

    public FailoverDataSource(List<DataSource> dataSources) {
        this.dataSources = dataSources;
    }

    @Override
    public Connection getConnection() throws SQLException {
        int attempts = 0;
        int size = dataSources.size();
        while (attempts < size) {
            int index = (currentIndex.get() + attempts) % size;
            DataSource dataSource = dataSources.get(index);
            try {
                Connection connection = dataSource.getConnection();
                // Remember the data source that worked so later calls start there
                currentIndex.set(index);
                return connection;
            } catch (SQLException e) {
                attempts++;
                if (attempts >= size) {
                    throw new SQLException("No data source is available", e);
                }
            }
        }
        throw new SQLException("Unable to obtain a database connection");
    }

    // Remaining DataSource methods (getConnection(user, password), unwrap, etc.)
    // omitted for brevity
}
2. The Fault Isolation Principle
Isolation mechanisms keep a fault from spreading and dragging down unrelated functionality.
Thread Pool Isolation Implementation
// Bulkhead pattern: one dedicated thread pool per downstream service
@Configuration
public class ThreadPoolIsolationConfig {

    @Bean("userServiceExecutor")
    public Executor userServiceExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(10);
        executor.setMaxPoolSize(50);
        executor.setQueueCapacity(1000);
        executor.setThreadNamePrefix("UserService-");
        // When saturated, run the task on the caller's thread instead of dropping it
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        executor.initialize();
        return executor;
    }

    @Bean("orderServiceExecutor")
    public Executor orderServiceExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(15);
        executor.setMaxPoolSize(75);
        executor.setQueueCapacity(1500);
        executor.setThreadNamePrefix("OrderService-");
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        executor.initialize();
        return executor;
    }

    @Bean("paymentServiceExecutor")
    public Executor paymentServiceExecutor() {
        ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor();
        executor.setCorePoolSize(5);
        executor.setMaxPoolSize(25);
        executor.setQueueCapacity(500);
        executor.setThreadNamePrefix("PaymentService-");
        executor.setRejectedExecutionHandler(new ThreadPoolExecutor.CallerRunsPolicy());
        executor.initialize();
        return executor;
    }
}
// Service isolation: each call type runs on its own pool, so a slow
// order service cannot exhaust the threads that serve user lookups
@Service
public class IsolatedService {

    @Autowired
    @Qualifier("userServiceExecutor")
    private Executor userServiceExecutor;

    @Autowired
    @Qualifier("orderServiceExecutor")
    private Executor orderServiceExecutor;

    @Autowired
    private UserRepository userRepository;

    @Autowired
    private OrderService orderService;

    public CompletableFuture<User> getUserAsync(String userId) {
        return CompletableFuture.supplyAsync(() -> {
            // User-service logic
            return userRepository.findById(userId);
        }, userServiceExecutor);
    }

    public CompletableFuture<Order> createOrderAsync(CreateOrderRequest request) {
        return CompletableFuture.supplyAsync(() -> {
            // Order-service logic
            return orderService.createOrder(request);
        }, orderServiceExecutor);
    }
}
3. The Automatic Recovery Principle
The system should detect faults on its own and take recovery action without waiting for an operator.
Health Check Implementation
// Health check service: probes registered indicators on a schedule
// and triggers a recovery strategy when one reports unhealthy
@Slf4j
@Component
public class HealthCheckService {

    @Autowired
    private List<HealthIndicator> healthIndicators;

    private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);
    private final Map<String, HealthStatus> healthStatusMap = new ConcurrentHashMap<>();

    @PostConstruct
    public void init() {
        // Kick off periodic health checks every 30 seconds
        scheduler.scheduleWithFixedDelay(this::performHealthChecks, 0, 30, TimeUnit.SECONDS);
    }

    public void performHealthChecks() {
        for (HealthIndicator indicator : healthIndicators) {
            try {
                HealthStatus status = indicator.checkHealth();
                healthStatusMap.put(indicator.getName(), status);
                if (status.isUnhealthy()) {
                    handleUnhealthyService(indicator.getName(), status);
                }
            } catch (Exception e) {
                log.error("Health check failed: {}", indicator.getName(), e);
                healthStatusMap.put(indicator.getName(), HealthStatus.down("check error: " + e.getMessage()));
            }
        }
    }

    private void handleUnhealthyService(String serviceName, HealthStatus status) {
        log.warn("Service {} failed its health check: {}", serviceName, status.getDetails());
        // Pick a recovery strategy based on the failure type
        switch (status.getFailureType()) {
            case TIMEOUT:
                handleTimeout(serviceName);
                break;
            case CONNECTION_FAILURE:
                handleConnectionFailure(serviceName);
                break;
            case RESOURCE_EXHAUSTION:
                handleResourceExhaustion(serviceName);
                break;
            case DEPENDENCY_FAILURE:
                handleDependencyFailure(serviceName);
                break;
            default:
                handleGenericFailure(serviceName);
        }
    }

    private void handleTimeout(String serviceName) {
        // Raise the timeout or retry
        log.info("Handling timeout for service {}", serviceName);
    }

    private void handleConnectionFailure(String serviceName) {
        // Reconnect or switch to a standby endpoint
        log.info("Handling connection failure for service {}", serviceName);
    }

    private void handleResourceExhaustion(String serviceName) {
        // Release resources or provision more
        log.info("Handling resource exhaustion for service {}", serviceName);
    }

    private void handleDependencyFailure(String serviceName) {
        // Check the state of downstream dependencies
        log.info("Handling dependency failure for service {}", serviceName);
    }

    private void handleGenericFailure(String serviceName) {
        // Generic recovery: restart the service
        log.info("Restarting service {}", serviceName);
        restartService(serviceName);
    }

    private void restartService(String serviceName) {
        // Restart: graceful shutdown, short pause, start again.
        // gracefullyShutdownService/startService are assumed hooks
        // into the deployment platform.
        try {
            gracefullyShutdownService(serviceName);
            Thread.sleep(5000);
            startService(serviceName);
            log.info("Service {} restarted", serviceName);
        } catch (Exception e) {
            log.error("Failed to restart service {}", serviceName, e);
        }
    }
}
// Custom health indicator for the database
@Component
public class DatabaseHealthIndicator implements HealthIndicator {

    @Autowired
    private DataSource dataSource;

    @Override
    public String getName() {
        return "database";
    }

    @Override
    public HealthStatus checkHealth() {
        try (Connection connection = dataSource.getConnection()) {
            // Verify the connection, then run a trivial probe query
            if (connection.isValid(5)) {
                try (PreparedStatement stmt = connection.prepareStatement("SELECT 1");
                     ResultSet rs = stmt.executeQuery()) {
                    if (rs.next()) {
                        return HealthStatus.up("database connection OK");
                    }
                }
            }
            return HealthStatus.down("database connection invalid");
        } catch (SQLException e) {
            return HealthStatus.down("database connection failed: " + e.getMessage());
        }
    }
}
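The HealthIndicator and HealthStatus types used above are custom to this article; note that this is not Spring Boot Actuator's HealthIndicator, whose single method is health(). A minimal sketch of what they might look like, with the failure types the switch above dispatches on:
// Assumed custom health-check contract (illustrative sketch)
public interface HealthIndicator {
    String getName();
    HealthStatus checkHealth();
}

public class HealthStatus {

    public enum FailureType { NONE, TIMEOUT, CONNECTION_FAILURE, RESOURCE_EXHAUSTION, DEPENDENCY_FAILURE, UNKNOWN }

    private final boolean healthy;
    private final String details;
    private final FailureType failureType;

    private HealthStatus(boolean healthy, String details, FailureType failureType) {
        this.healthy = healthy;
        this.details = details;
        this.failureType = failureType;
    }

    public static HealthStatus up(String details) {
        return new HealthStatus(true, details, FailureType.NONE);
    }

    public static HealthStatus down(String details) {
        return new HealthStatus(false, details, FailureType.UNKNOWN);
    }

    public static HealthStatus down(String details, FailureType type) {
        return new HealthStatus(false, details, type);
    }

    public boolean isUnhealthy() { return !healthy; }
    public String getDetails() { return details; }
    public FailureType getFailureType() { return failureType; }
}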
Core Techniques of High-Availability Architecture
1. Load Balancing and Failover
A load balancer distributes requests sensibly across backends and fails traffic over automatically when one of them dies.
Nginx High-Availability Configuration
# Nginx load-balancing configuration for high availability
upstream backend_servers {
    least_conn;  # pick the server with the fewest active connections

    # Backend server pool
    server 192.168.1.10:8080 weight=3 max_fails=3 fail_timeout=30s;
    server 192.168.1.11:8080 weight=2 max_fails=3 fail_timeout=30s;
    server 192.168.1.12:8080 weight=1 max_fails=3 fail_timeout=30s backup;  # standby, used only when the others fail

    # Keep idle upstream connections open for reuse
    keepalive 32;
    keepalive_timeout 60s;
    keepalive_requests 100;
}
# High-availability virtual server
server {
    listen 80;
    server_name ha.example.com;

    # Lightweight endpoint for external health checks
    location /nginx-health {
        access_log off;
        return 200 "healthy\n";
        add_header Content-Type text/plain;
    }

    location / {
        proxy_pass http://backend_servers;
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Timeouts
        proxy_connect_timeout 5s;
        proxy_send_timeout 10s;
        proxy_read_timeout 10s;

        # Buffering
        proxy_buffering on;
        proxy_buffer_size 4k;
        proxy_buffers 8 4k;
        proxy_busy_buffers_size 8k;

        # Retry the next upstream on failure
        proxy_next_upstream error timeout invalid_header http_500 http_502 http_503 http_504;
        proxy_next_upstream_tries 3;
        proxy_next_upstream_timeout 10s;
    }
}
# Keepalived (VRRP) example: a floating virtual IP in front of the load balancers
vrrp_script chk_nginx {
    script "/etc/keepalived/check_nginx.sh"
    interval 2
    weight -20
}

vrrp_instance VI_1 {
    state MASTER
    interface eth0
    virtual_router_id 51
    priority 100
    advert_int 1
    authentication {
        auth_type PASS
        auth_pass 1111
    }
    virtual_ipaddress {
        192.168.1.100/24 dev eth0
    }
    track_script {
        chk_nginx
    }
}
Application-Layer Load Balancing
// Spring Cloud LoadBalancer configuration
@Configuration
@LoadBalancerClient(name = "user-service", configuration = UserServiceLoadBalancerConfig.class)
public class LoadBalancerConfig {

    @Bean
    public ReactorLoadBalancer<ServiceInstance> userServiceLoadBalancer(
            Environment environment,
            LoadBalancerClientFactory loadBalancerClientFactory) {
        String name = environment.getProperty(LoadBalancerClientFactory.PROPERTY_NAME);
        return new RoundRobinLoadBalancer(loadBalancerClientFactory
                .getLazyProvider(name, ServiceInstanceListSupplier.class), name);
    }
}
// Custom load-balancing strategy that skips unhealthy instances
@Slf4j
@Component
public class HealthAwareLoadBalancer implements ReactorServiceInstanceLoadBalancer {

    private final String serviceId;
    private final ServiceInstanceListSupplier serviceInstanceListSupplier;
    private final RestTemplate restTemplate = new RestTemplate();

    public HealthAwareLoadBalancer(ServiceInstanceListSupplier serviceInstanceListSupplier, String serviceId) {
        this.serviceId = serviceId;
        this.serviceInstanceListSupplier = serviceInstanceListSupplier;
    }

    @Override
    public Mono<Response<ServiceInstance>> choose(Request request) {
        return serviceInstanceListSupplier.get().next()
                .map(serviceInstances -> getInstanceResponse(serviceInstances));
    }

    private Response<ServiceInstance> getInstanceResponse(List<ServiceInstance> instances) {
        if (instances.isEmpty()) {
            return new EmptyResponse();
        }
        // Filter out instances that fail the health probe
        List<ServiceInstance> healthyInstances = instances.stream()
                .filter(this::isHealthy)
                .collect(Collectors.toList());
        if (healthyInstances.isEmpty()) {
            // No healthy instance: return an empty response
            return new EmptyResponse();
        }
        // Pick one of the healthy instances at random
        int index = ThreadLocalRandom.current().nextInt(healthyInstances.size());
        ServiceInstance instance = healthyInstances.get(index);
        return new DefaultResponse(instance);
    }

    private boolean isHealthy(ServiceInstance instance) {
        // Synchronous probe against the instance's health endpoint.
        // In production this should be replaced by cached health state:
        // a blocking HTTP call per choice is too expensive on a hot path.
        try {
            String healthUrl = instance.getUri() + "/actuator/health";
            ResponseEntity<String> response = restTemplate.getForEntity(healthUrl, String.class);
            return response.getStatusCode() == HttpStatus.OK;
        } catch (Exception e) {
            log.warn("Health check failed for instance {}", instance.getUri(), e);
            return false;
        }
    }
}
2. Data High-Availability Architecture
Keeping data reliable and consistent is a core part of any high-availability design.
MySQL High-Availability Configuration
# MySQL primary/replica replication
# Primary configuration (my.cnf)
[mysqld]
server-id=1
log-bin=mysql-bin
binlog-format=ROW
sync-binlog=1
innodb-flush-log-at-trx-commit=1

# Replica configuration (my.cnf)
[mysqld]
server-id=2
relay-log=mysql-relay-bin
read-only=1
super-read-only=1
skip-slave-start=1

# On the primary: create the replication user
CREATE USER 'repl'@'%' IDENTIFIED BY 'password';
GRANT REPLICATION SLAVE ON *.* TO 'repl'@'%';

# On the replica: point it at the primary and start replication
CHANGE MASTER TO
    MASTER_HOST='master.example.com',
    MASTER_USER='repl',
    MASTER_PASSWORD='password',
    MASTER_LOG_FILE='mysql-bin.000001',
    MASTER_LOG_POS=107;
START SLAVE;

# Monitor replication status
SHOW SLAVE STATUS\G
---
# MHA (Master High Availability) configuration
[server default]
user=mha
password=password
manager_workdir=/var/log/mha/app1
manager_log=/var/log/mha/app1/manager.log
remote_workdir=/var/log/mha/app1
ssh_user=root
repl_user=repl
repl_password=password
ping_interval=1
ping_type=SELECT
[server1]
hostname=master.example.com
candidate_master=1
[server2]
hostname=slave1.example.com
candidate_master=1
[server3]
hostname=slave2.example.com
no_master=1
Redis High-Availability Cluster Configuration
# Redis Sentinel configuration
# sentinel.conf
port 26379
dir "/var/lib/redis"
sentinel monitor mymaster master.example.com 6379 2
sentinel down-after-milliseconds mymaster 5000
sentinel parallel-syncs mymaster 1
sentinel failover-timeout mymaster 10000
sentinel auth-pass mymaster password

# Start the sentinel
redis-sentinel /etc/redis/sentinel.conf
---
# Redis Cluster configuration
# redis-cluster.conf
port 6379
cluster-enabled yes
cluster-config-file nodes-6379.conf
cluster-node-timeout 5000
cluster-require-full-coverage no
appendonly yes

# Create the cluster (three masters, one replica each)
redis-cli --cluster create \
    192.168.1.10:6379 \
    192.168.1.11:6379 \
    192.168.1.12:6379 \
    192.168.1.13:6379 \
    192.168.1.14:6379 \
    192.168.1.15:6379 \
    --cluster-replicas 1
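On the application side, a Sentinel-aware client discovers the current master through the sentinels, so a failover needs no configuration change in the service. A minimal sketch using the Lettuce client; the sentinel hostnames are placeholders:
// Sentinel-aware Redis client (Lettuce); failover is transparent to the app
import io.lettuce.core.ReadFrom;
import io.lettuce.core.RedisClient;
import io.lettuce.core.RedisURI;
import io.lettuce.core.codec.StringCodec;
import io.lettuce.core.masterreplica.MasterReplica;
import io.lettuce.core.masterreplica.StatefulRedisMasterReplicaConnection;

public class SentinelClientExample {
    public static void main(String[] args) {
        // Ask the sentinels which node is currently the master of "mymaster"
        RedisURI uri = RedisURI.Builder
                .sentinel("sentinel1.example.com", 26379, "mymaster")
                .withSentinel("sentinel2.example.com", 26379)
                .withSentinel("sentinel3.example.com", 26379)
                .build();

        RedisClient client = RedisClient.create();
        StatefulRedisMasterReplicaConnection<String, String> connection =
                MasterReplica.connect(client, StringCodec.UTF8, uri);
        connection.setReadFrom(ReadFrom.REPLICA_PREFERRED);  // reads go to replicas when available

        connection.sync().set("ha:test", "ok");
        System.out.println(connection.sync().get("ha:test"));

        connection.close();
        client.shutdown();
    }
}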
Distributed Database Middleware
// Database sharding plus read/write splitting for data high availability
@Configuration
@MapperScan("com.example.mapper")
public class ShardingDataSourceConfig {

    @Bean
    public DataSource shardingDataSource() throws SQLException {
        // Physical data sources
        Map<String, DataSource> dataSourceMap = new HashMap<>();
        // Masters
        dataSourceMap.put("master0", createDataSource("master0.example.com", 3306));
        dataSourceMap.put("master1", createDataSource("master1.example.com", 3306));
        // Replicas
        dataSourceMap.put("slave0", createDataSource("slave0.example.com", 3306));
        dataSourceMap.put("slave1", createDataSource("slave1.example.com", 3306));

        // Sharding rules
        ShardingRuleConfiguration shardingRuleConfig = new ShardingRuleConfiguration();

        // User table: 2 databases x 2 tables, routed by user_id
        TableRuleConfiguration userTableRuleConfig = new TableRuleConfiguration("user", "master${0..1}.user_${0..1}");
        userTableRuleConfig.setDatabaseShardingStrategyConfig(new InlineShardingStrategyConfiguration("user_id", "master${user_id % 2}"));
        userTableRuleConfig.setTableShardingStrategyConfig(new InlineShardingStrategyConfiguration("user_id", "user_${user_id % 2}"));
        shardingRuleConfig.getTableRuleConfigs().add(userTableRuleConfig);

        // Read/write splitting: each master has one replica
        MasterSlaveRuleConfiguration masterSlaveRuleConfig0 = new MasterSlaveRuleConfiguration("master0", "master0", Arrays.asList("slave0"));
        MasterSlaveRuleConfiguration masterSlaveRuleConfig1 = new MasterSlaveRuleConfiguration("master1", "master1", Arrays.asList("slave1"));
        shardingRuleConfig.setMasterSlaveRuleConfigs(Arrays.asList(masterSlaveRuleConfig0, masterSlaveRuleConfig1));

        // Build the composite data source
        return ShardingDataSourceFactory.createDataSource(dataSourceMap, shardingRuleConfig, new Properties());
    }

    private DataSource createDataSource(String host, int port) {
        HikariConfig config = new HikariConfig();
        config.setDriverClassName("com.mysql.cj.jdbc.Driver");
        config.setJdbcUrl(String.format("jdbc:mysql://%s:%d/myapp?useSSL=false&serverTimezone=UTC", host, port));
        config.setUsername("root");
        config.setPassword("password");
        config.setMaximumPoolSize(20);
        config.setMinimumIdle(5);
        config.setConnectionTimeout(30000);
        config.setIdleTimeout(600000);
        config.setMaxLifetime(1800000);
        return new HikariDataSource(config);
    }
}
// Read/write splitting routing context
@Component
public class ReadWriteSplittingRouter {

    private static final ThreadLocal<Boolean> readOnlyContext = new ThreadLocal<>();

    public static void setReadOnly(boolean readOnly) {
        readOnlyContext.set(readOnly);
    }

    public static boolean isReadOnly() {
        return Boolean.TRUE.equals(readOnlyContext.get());
    }

    public static void clear() {
        readOnlyContext.remove();
    }
}
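The router above only stores a flag; something must consume it when a connection is chosen. A minimal sketch of how it could drive Spring's AbstractRoutingDataSource; the "master"/"slave" keys are assumptions that must match whatever is registered via setTargetDataSources:
// Routing data source driven by the ThreadLocal flag above
import org.springframework.jdbc.datasource.lookup.AbstractRoutingDataSource;

public class ReadWriteRoutingDataSource extends AbstractRoutingDataSource {
    @Override
    protected Object determineCurrentLookupKey() {
        // Route reads to the replica, everything else to the master
        return ReadWriteSplittingRouter.isReadOnly() ? "slave" : "master";
    }
}

// Typical call-site usage: mark the thread read-only for the duration of a query
// ReadWriteSplittingRouter.setReadOnly(true);
// try {
//     return userRepository.findById(id);
// } finally {
//     ReadWriteSplittingRouter.clear();  // always clear the ThreadLocal
// }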
3. Service High-Availability Architecture
Designing services for high availability in a microservice architecture.
Service Registration and Discovery
# Eureka server high-availability configuration
# eureka-server application.yml
spring:
  application:
    name: eureka-server

server:
  port: 8761

eureka:
  instance:
    hostname: eureka1.example.com
    prefer-ip-address: true
    lease-renewal-interval-in-seconds: 30
    lease-expiration-duration-in-seconds: 90
  client:
    register-with-eureka: true
    fetch-registry: true
    service-url:
      defaultZone: http://eureka2.example.com:8762/eureka/,http://eureka3.example.com:8763/eureka/
  server:
    enable-self-preservation: true
    renewal-percent-threshold: 0.85
    eviction-interval-timer-in-ms: 60000
---
# Eureka client configuration
eureka:
  client:
    service-url:
      defaultZone: http://eureka1.example.com:8761/eureka/,http://eureka2.example.com:8762/eureka/,http://eureka3.example.com:8763/eureka/
    register-with-eureka: true
    fetch-registry: true
    registry-fetch-interval-seconds: 30
    heartbeat-executor-thread-pool-size: 5
    cache-refresh-executor-thread-pool-size: 5
  instance:
    prefer-ip-address: true
    lease-renewal-interval-in-seconds: 30
    lease-expiration-duration-in-seconds: 90
    metadata-map:
      zone: zone1
      environment: production
Circuit Breaker Pattern
// Hystrix circuit breaker defaults
@Configuration
@EnableCircuitBreaker
public class HystrixConfig {

    @Bean
    public HystrixCommandProperties.Setter defaultHystrixProperties() {
        return HystrixCommandProperties.Setter()
                .withExecutionTimeoutInMilliseconds(5000)
                .withCircuitBreakerEnabled(true)
                .withCircuitBreakerRequestVolumeThreshold(20)
                .withCircuitBreakerSleepWindowInMilliseconds(10000)
                .withCircuitBreakerErrorThresholdPercentage(50)
                .withMetricsRollingStatisticalWindowInMilliseconds(10000)
                .withMetricsRollingStatisticalWindowBuckets(10);
    }
}
// Circuit breaking around a remote call, with a fallback
@Slf4j
@Service
public class UserService {

    @Autowired
    private RestTemplate restTemplate;

    @HystrixCommand(
        fallbackMethod = "getUserFallback",
        commandProperties = {
            @HystrixProperty(name = "circuitBreaker.enabled", value = "true"),
            @HystrixProperty(name = "circuitBreaker.requestVolumeThreshold", value = "20"),
            @HystrixProperty(name = "circuitBreaker.sleepWindowInMilliseconds", value = "10000"),
            @HystrixProperty(name = "circuitBreaker.errorThresholdPercentage", value = "50"),
            @HystrixProperty(name = "execution.isolation.thread.timeoutInMilliseconds", value = "3000")
        },
        threadPoolProperties = {
            @HystrixProperty(name = "coreSize", value = "20"),
            @HystrixProperty(name = "maxQueueSize", value = "100")
        }
    )
    public User getUser(String userId) {
        // The remote call that the circuit breaker protects
        return restTemplate.getForObject("http://user-service/users/" + userId, User.class);
    }

    public User getUserFallback(String userId) {
        log.warn("User service circuit opened, returning fallback data: {}", userId);
        return User.builder()
                .id(userId)
                .name("Default User")
                .status("CIRCUIT_BREAKER_ACTIVE")
                .build();
    }
}
// Resilience4j circuit breaker defaults
@Configuration
public class Resilience4jConfig {

    @Bean
    public Customizer<Resilience4JCircuitBreakerFactory> defaultCustomizer() {
        return factory -> factory.configureDefault(id -> {
            CircuitBreakerConfig circuitBreakerConfig = CircuitBreakerConfig.custom()
                    .failureRateThreshold(50)
                    .waitDurationInOpenState(Duration.ofMillis(1000))
                    .slidingWindowSize(20)
                    .minimumNumberOfCalls(10)
                    .build();
            TimeLimiterConfig timeLimiterConfig = TimeLimiterConfig.custom()
                    .timeoutDuration(Duration.ofSeconds(3))
                    .build();
            return new Resilience4JConfigBuilder(id)
                    .circuitBreakerConfig(circuitBreakerConfig)
                    .timeLimiterConfig(timeLimiterConfig)
                    .build();
        });
    }
}
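With the factory configured, calls can be wrapped without Hystrix annotations. A minimal sketch, assuming spring-cloud-starter-circuitbreaker-resilience4j is on the classpath and reusing the User model from the Hystrix example:
// Wrapping a remote call with the default Resilience4j circuit breaker
import org.springframework.cloud.client.circuitbreaker.CircuitBreakerFactory;
import org.springframework.stereotype.Service;
import org.springframework.web.client.RestTemplate;

@Service
public class ResilientUserClient {

    private final CircuitBreakerFactory circuitBreakerFactory;
    private final RestTemplate restTemplate;

    public ResilientUserClient(CircuitBreakerFactory circuitBreakerFactory, RestTemplate restTemplate) {
        this.circuitBreakerFactory = circuitBreakerFactory;
        this.restTemplate = restTemplate;
    }

    public User getUser(String userId) {
        // run() executes the call under the default circuit breaker and
        // falls back to a stub user when the call fails or the circuit is open
        return circuitBreakerFactory.create("user-service").run(
                () -> restTemplate.getForObject("http://user-service/users/" + userId, User.class),
                throwable -> User.builder()
                        .id(userId)
                        .name("Default User")
                        .status("CIRCUIT_BREAKER_ACTIVE")
                        .build());
    }
}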
Rate Limiting Protection
// Token bucket rate limiter
@Component
public class TokenBucketRateLimiter {

    private final Map<String, TokenBucket> buckets = new ConcurrentHashMap<>();

    public boolean tryAcquire(String key, int permits, int capacity, int refillRate) {
        TokenBucket bucket = buckets.computeIfAbsent(key, k -> new TokenBucket(capacity, refillRate));
        return bucket.tryAcquire(permits);
    }

    private static class TokenBucket {
        private final int capacity;    // maximum tokens the bucket can hold
        private final int refillRate;  // tokens added per second
        private final AtomicInteger tokens;
        private final AtomicLong lastRefillTime;

        public TokenBucket(int capacity, int refillRate) {
            this.capacity = capacity;
            this.refillRate = refillRate;
            this.tokens = new AtomicInteger(capacity);
            this.lastRefillTime = new AtomicLong(System.currentTimeMillis());
        }

        public boolean tryAcquire(int permits) {
            refill();
            // CAS loop: retry on contention instead of spuriously rejecting
            while (true) {
                int currentTokens = tokens.get();
                if (currentTokens < permits) {
                    return false;
                }
                if (tokens.compareAndSet(currentTokens, currentTokens - permits)) {
                    return true;
                }
            }
        }

        private void refill() {
            long now = System.currentTimeMillis();
            long lastRefill = lastRefillTime.get();
            long timePassed = now - lastRefill;
            if (timePassed > 0) {
                int tokensToAdd = (int) (timePassed * refillRate / 1000);
                if (tokensToAdd > 0) {
                    // Only the thread that wins the CAS adds the tokens
                    if (lastRefillTime.compareAndSet(lastRefill, now)) {
                        tokens.updateAndGet(current -> Math.min(capacity, current + tokensToAdd));
                    }
                }
            }
        }
    }
}
// Distributed rate limiting backed by Redis (sliding window)
@Component
public class DistributedRateLimiter {

    @Autowired
    private RedisTemplate<String, String> redisTemplate;

    private static final String RATE_LIMIT_KEY = "rate_limit:";

    public boolean isAllowed(String key, int maxRequests, int windowSeconds) {
        String redisKey = RATE_LIMIT_KEY + key;
        long now = System.currentTimeMillis();
        long windowStart = now - windowSeconds * 1000L;

        // Lua script so that check-and-record happens atomically inside Redis
        String luaScript = """
                local key = KEYS[1]
                local window_start = tonumber(ARGV[1])
                local now = tonumber(ARGV[2])
                local max_requests = tonumber(ARGV[3])
                local window_seconds = tonumber(ARGV[4])
                -- drop request records that fell out of the window
                redis.call('ZREMRANGEBYSCORE', key, 0, window_start)
                -- count requests still inside the window
                local current_requests = redis.call('ZCARD', key)
                -- admit the request if the window has room
                if current_requests < max_requests then
                    -- record this request
                    redis.call('ZADD', key, now, now)
                    -- expire the key with the window
                    redis.call('EXPIRE', key, window_seconds)
                    return 1
                else
                    return 0
                end
                """;

        Long result = redisTemplate.execute(
                new DefaultRedisScript<>(luaScript, Long.class),
                Collections.singletonList(redisKey),
                String.valueOf(windowStart),
                String.valueOf(now),
                String.valueOf(maxRequests),
                String.valueOf(windowSeconds)
        );
        return result != null && result == 1;
    }
}
// Rate-limiting aspect driven by the @RateLimit annotation (defined below)
@Aspect
@Component
public class RateLimitAspect {

    @Autowired
    private DistributedRateLimiter rateLimiter;

    @Around("@annotation(rateLimit)")
    public Object around(ProceedingJoinPoint point, RateLimit rateLimit) throws Throwable {
        String key = generateKey(point, rateLimit);
        if (!rateLimiter.isAllowed(key, rateLimit.maxRequests(), rateLimit.windowSeconds())) {
            throw new RateLimitExceededException("Too many requests, please try again later");
        }
        return point.proceed();
    }

    private String generateKey(ProceedingJoinPoint point, RateLimit rateLimit) {
        StringBuilder key = new StringBuilder();
        key.append(rateLimit.key()).append(":");
        if (rateLimit.perUser()) {
            // Per-user limit (getCurrentUserId() is assumed to read the auth context)
            String userId = getCurrentUserId();
            key.append("user:").append(userId);
        } else if (rateLimit.perIp()) {
            // Per-IP limit (getClientIp() is assumed to read the current request)
            String ip = getClientIp();
            key.append("ip:").append(ip);
        } else {
            // Per-method limit
            String methodName = point.getSignature().toShortString();
            key.append("method:").append(methodName);
        }
        return key.toString();
    }
}
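The aspect reads its settings from a @RateLimit annotation that the text never shows. One plausible definition (the defaults are illustrative):
// Assumed definition of the @RateLimit annotation used by the aspect
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

@Target(ElementType.METHOD)
@Retention(RetentionPolicy.RUNTIME)
public @interface RateLimit {
    String key() default "";           // logical name of the protected resource
    int maxRequests() default 100;     // allowed requests per window
    int windowSeconds() default 1;     // sliding window length in seconds
    boolean perUser() default false;   // rate-limit per authenticated user
    boolean perIp() default false;     // rate-limit per client IP
}

// Usage: @RateLimit(key = "createOrder", maxRequests = 50, windowSeconds = 1)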
4. Multi-Active Architecture Design
A multi-active deployment across regions provides the highest level of availability.
Multi-Active Data Center Architecture
# Multi-active architecture configuration
# Application configuration
spring:
  profiles:
    active: multi-active
  datasource:
    dynamic:
      primary: dc1
      strict: false
      datasource:
        dc1:
          url: jdbc:mysql://dc1-master:3306/myapp?useSSL=false
          username: root
          password: password
          driver-class-name: com.mysql.cj.jdbc.Driver
        dc2:
          url: jdbc:mysql://dc2-master:3306/myapp?useSSL=false
          username: root
          password: password
          driver-class-name: com.mysql.cj.jdbc.Driver
        dc3:
          url: jdbc:mysql://dc3-master:3306/myapp?useSSL=false
          username: root
          password: password
          driver-class-name: com.mysql.cj.jdbc.Driver

# Data center topology
datacenter:
  current: dc1
  nodes:
    dc1:
      name: "Data Center 1"
      region: "beijing"
      priority: 1
      weight: 50
      databases:
        - dc1-master
        - dc1-slave1
        - dc1-slave2
    dc2:
      name: "Data Center 2"
      region: "shanghai"
      priority: 2
      weight: 30
      databases:
        - dc2-master
        - dc2-slave1
        - dc2-slave2
    dc3:
      name: "Data Center 3"
      region: "shenzhen"
      priority: 3
      weight: 20
      databases:
        - dc3-master
        - dc3-slave1
        - dc3-slave2

# Data replication between centers
replication:
  mode: async
  lag-threshold: 1000
  conflict-resolution: last-write-wins
  sync-strategy:
    - dc1->dc2
    - dc1->dc3
    - dc2->dc1
    - dc2->dc3
    - dc3->dc1
    - dc3->dc2
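The replication block names last-write-wins as the conflict-resolution policy. A sketch of the core of such a resolver follows; the VersionedRecord type is illustrative. Note that LWW depends on reasonably synchronized clocks and silently discards the losing concurrent write, which is exactly the trade-off that makes it simple.
// Last-write-wins conflict resolution (illustrative sketch)
public final class LastWriteWinsResolver {

    public static final class VersionedRecord {
        final String key;
        final String value;
        final long updatedAtMillis;  // wall-clock timestamp carried with the change event

        public VersionedRecord(String key, String value, long updatedAtMillis) {
            this.key = key;
            this.value = value;
            this.updatedAtMillis = updatedAtMillis;
        }
    }

    // Keep whichever copy was written last; ties go to the local copy
    public static VersionedRecord resolve(VersionedRecord local, VersionedRecord remote) {
        return remote.updatedAtMillis > local.updatedAtMillis ? remote : local;
    }
}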
Global Traffic Scheduling
// Global load balancer: routes each user to the best data center
@Slf4j
@Component
public class GlobalLoadBalancer {

    @Autowired
    private DataCenterHealthChecker healthChecker;

    @Autowired
    private UserLocationService locationService;

    private final Map<String, DataCenter> dataCenters = new ConcurrentHashMap<>();

    public String selectDataCenter(String userId, String serviceName) {
        // 1. Locate the user geographically
        UserLocation location = locationService.getUserLocation(userId);
        // 2. Keep only healthy data centers
        List<DataCenter> healthyDCs = getHealthyDataCenters();
        if (healthyDCs.isEmpty()) {
            throw new NoAvailableDataCenterException("No data center is available");
        }
        // 3. Pick the best data center by location and latency
        DataCenter selectedDC = selectOptimalDataCenter(location, healthyDCs);
        log.info("Selected data center {} for user {} (region: {})", selectedDC.getName(), userId, location.getRegion());
        return selectedDC.getId();
    }

    private List<DataCenter> getHealthyDataCenters() {
        return dataCenters.values().stream()
                .filter(dc -> healthChecker.isHealthy(dc.getId()))
                .collect(Collectors.toList());
    }

    private DataCenter selectOptimalDataCenter(UserLocation location, List<DataCenter> dataCenters) {
        // Score every data center and take the best one (higher score is better)
        return dataCenters.stream()
                .max(Comparator.comparingDouble(dc -> calculateScore(dc, location)))
                .orElse(dataCenters.get(0));
    }

    private double calculateScore(DataCenter dc, UserLocation location) {
        double latencyScore = getLatencyScore(dc, location);
        double loadScore = getLoadScore(dc);
        double capacityScore = getCapacityScore(dc);
        // Weighted score: latency 0.5, load 0.3, capacity 0.2
        return latencyScore * 0.5 + loadScore * 0.3 + capacityScore * 0.2;
    }

    private double getLatencyScore(DataCenter dc, UserLocation location) {
        // Score network latency estimated from the two regions
        long latency = NetworkLatencyCalculator.calculate(dc.getRegion(), location.getRegion());
        return Math.max(0, 100 - latency / 10.0);
    }

    private double getLoadScore(DataCenter dc) {
        // Data center load derived from CPU and memory usage
        double cpuUsage = healthChecker.getCpuUsage(dc.getId());
        double memoryUsage = healthChecker.getMemoryUsage(dc.getId());
        double avgLoad = (cpuUsage + memoryUsage) / 2;
        return Math.max(0, 100 - avgLoad);
    }

    private double getCapacityScore(DataCenter dc) {
        // Remaining capacity of the data center
        double remainingCapacity = healthChecker.getRemainingCapacity(dc.getId());
        return remainingCapacity;
    }
}
// Data center health checker: periodically probes every center and caches the results
@Slf4j
@Component
public class DataCenterHealthChecker {

    private final Map<String, DataCenterHealth> healthStatus = new ConcurrentHashMap<>();
    private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2);

    @PostConstruct
    public void init() {
        // Periodic checks every 30 seconds
        scheduler.scheduleWithFixedDelay(this::checkAllDataCenters, 0, 30, TimeUnit.SECONDS);
    }

    public boolean isHealthy(String dataCenterId) {
        DataCenterHealth health = healthStatus.get(dataCenterId);
        return health != null && health.isHealthy();
    }

    public double getCpuUsage(String dataCenterId) {
        DataCenterHealth health = healthStatus.get(dataCenterId);
        return health != null ? health.getCpuUsage() : 100.0;  // pessimistic default
    }

    public double getMemoryUsage(String dataCenterId) {
        DataCenterHealth health = healthStatus.get(dataCenterId);
        return health != null ? health.getMemoryUsage() : 100.0;
    }

    public double getRemainingCapacity(String dataCenterId) {
        DataCenterHealth health = healthStatus.get(dataCenterId);
        return health != null ? health.getRemainingCapacity() : 0.0;
    }

    private void checkAllDataCenters() {
        // Probe every known data center
        for (String dataCenterId : getAllDataCenterIds()) {
            try {
                DataCenterHealth health = checkDataCenter(dataCenterId);
                healthStatus.put(dataCenterId, health);
            } catch (Exception e) {
                log.error("Health check failed for data center {}", dataCenterId, e);
                healthStatus.put(dataCenterId, DataCenterHealth.unhealthy(dataCenterId, e.getMessage()));
            }
        }
    }

    private DataCenterHealth checkDataCenter(String dataCenterId) {
        // Aggregate the individual probes (the check*/get* helpers are
        // assumed to query the data center's monitoring endpoints)
        boolean databaseHealthy = checkDatabaseHealth(dataCenterId);
        boolean networkHealthy = checkNetworkHealth(dataCenterId);
        boolean servicesHealthy = checkServicesHealth(dataCenterId);
        double cpuUsage = getDataCenterCpuUsage(dataCenterId);
        double memoryUsage = getDataCenterMemoryUsage(dataCenterId);
        double remainingCapacity = calculateRemainingCapacity(dataCenterId);
        boolean isHealthy = databaseHealthy && networkHealthy && servicesHealthy;
        return DataCenterHealth.builder()
                .dataCenterId(dataCenterId)
                .healthy(isHealthy)
                .cpuUsage(cpuUsage)
                .memoryUsage(memoryUsage)
                .remainingCapacity(remainingCapacity)
                .lastCheckTime(System.currentTimeMillis())
                .build();
    }
}
Best Practices for High-Availability Architecture
1. Monitoring and Alerting
# Prometheus configuration for HA monitoring
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - "ha_alerts.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager1:9093
            - alertmanager2:9093
            - alertmanager3:9093

scrape_configs:
  # Application health
  - job_name: 'application-health'
    metrics_path: '/actuator/health'
    static_configs:
      - targets: ['app1:8080', 'app2:8080', 'app3:8080']
  # Database monitoring
  - job_name: 'database-health'
    static_configs:
      - targets: ['mysql-exporter1:9104', 'mysql-exporter2:9104']
  # Redis monitoring
  - job_name: 'redis-health'
    static_configs:
      - targets: ['redis-exporter1:9121', 'redis-exporter2:9121']
  # Load balancer monitoring
  - job_name: 'loadbalancer-health'
    static_configs:
      - targets: ['nginx-exporter1:9113', 'nginx-exporter2:9113']
---
# High-availability alert rules (ha_alerts.yml)
groups:
  - name: high_availability_alerts
    rules:
      # Service down
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service unavailable"
          description: "Instance {{ $labels.instance }} has stopped responding"
      # MySQL replication lag
      - alert: MySQLReplicationLag
        expr: mysql_slave_lag_seconds > 10
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "MySQL replication lag too high"
          description: "Replication lag is {{ $value }} seconds"
      # Redis cluster node failure
      - alert: RedisClusterNodeFailure
        expr: redis_cluster_state != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis cluster state abnormal"
          description: "The Redis cluster is not in a healthy state"
      # Load balancer backend health check failure
      - alert: LoadBalancerBackendUnhealthy
        expr: nginx_up == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Load balancer backend unhealthy"
          description: "Nginx backend {{ $labels.instance }} is failing its health check"
      # Inter-data-center network latency
      - alert: InterDataCenterLatencyHigh
        expr: datacenter_network_latency_ms > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Inter-data-center latency too high"
          description: "Latency between data centers is {{ $value }}ms"
      # Circuit breaker open
      - alert: CircuitBreakerOpen
        expr: circuit_breaker_state == 1
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "Circuit breaker open"
          description: "The circuit breaker for service {{ $labels.service }} is open"
      # Rate limiting triggered
      - alert: RateLimitTriggered
        expr: rate_limit_rejected_total > 0
        for: 1m
        labels:
          severity: info
        annotations:
          summary: "Rate limit triggered"
          description: "Service {{ $labels.service }} is rejecting requests; rejected count {{ $value }}"
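Several of these alerts query application-level metrics such as rate_limit_rejected_total, which the services must export themselves. A minimal sketch, assuming Micrometer with the Prometheus registry (which renders a counter named rate.limit.rejected as rate_limit_rejected_total):
// Exposing the rate-limit rejection counter the alert rule above queries
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.MeterRegistry;
import org.springframework.stereotype.Component;

@Component
public class RateLimitMetrics {

    private final Counter rejectedCounter;

    public RateLimitMetrics(MeterRegistry registry) {
        // The Prometheus registry exports this as rate_limit_rejected_total
        this.rejectedCounter = Counter.builder("rate.limit.rejected")
                .description("Requests rejected by the rate limiter")
                .tag("service", "order-service")  // tag value is illustrative
                .register(registry);
    }

    // Call this from the rate-limiting aspect whenever a request is rejected
    public void recordRejection() {
        rejectedCounter.increment();
    }
}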
2. Chaos Engineering Drills
// Chaos engineering service: injects controlled faults.
// The *Injector/*Simulator/*Generator classes are illustrative stand-ins
// for whatever fault-injection tooling is actually in use.
@Slf4j
@Service
public class ChaosEngineeringService {

    @Autowired
    private ApplicationContext applicationContext;

    @Autowired
    private ExecutorService executorService;

    // Network latency fault
    public void injectNetworkLatency(String serviceName, long delayMillis) {
        log.info("Injecting network latency: {} - {}ms", serviceName, delayMillis);
        // Add the delay through a proxy layer
        NetworkLatencyInjector injector = new NetworkLatencyInjector(delayMillis);
        injector.inject(serviceName);
    }

    // Service unavailability fault
    public void injectServiceUnavailable(String serviceName, int errorRate, int durationSeconds) {
        log.info("Injecting service unavailability: {} - error rate: {}% - duration: {}s", serviceName, errorRate, durationSeconds);
        ServiceFailureSimulator simulator = new ServiceFailureSimulator(errorRate);
        CompletableFuture.runAsync(() -> {
            simulator.simulateFailure(serviceName, durationSeconds);
        }, executorService);
    }

    // Connection pool exhaustion fault
    public void injectDatabaseConnectionExhaustion(String dataSourceName, int maxConnections) {
        log.info("Injecting connection pool exhaustion: {} - max connections: {}", dataSourceName, maxConnections);
        DataSourceConnectionExhauster exhauster = new DataSourceConnectionExhauster();
        exhauster.exhaustConnections(dataSourceName, maxConnections);
    }

    // Memory leak fault
    public void injectMemoryLeak(String serviceName, int leakRate, int durationMinutes) {
        log.info("Injecting memory leak: {} - leak rate: {}MB/min - duration: {}min", serviceName, leakRate, durationMinutes);
        MemoryLeakSimulator simulator = new MemoryLeakSimulator(leakRate);
        CompletableFuture.runAsync(() -> {
            simulator.simulateMemoryLeak(serviceName, durationMinutes);
        }, executorService);
    }

    // CPU spike fault
    public void injectHighCPU(String serviceName, int cpuUsage, int durationSeconds) {
        log.info("Injecting CPU spike: {} - CPU usage: {}% - duration: {}s", serviceName, cpuUsage, durationSeconds);
        CPUStressGenerator generator = new CPUStressGenerator(cpuUsage);
        CompletableFuture.runAsync(() -> {
            generator.generateLoad(serviceName, durationSeconds);
        }, executorService);
    }

    // Disk I/O blocking fault
    public void injectDiskIOBlock(String serviceName, int blockDurationSeconds) {
        log.info("Injecting disk I/O block: {} - duration: {}s", serviceName, blockDurationSeconds);
        DiskIOBlocker blocker = new DiskIOBlocker();
        blocker.blockIO(serviceName, blockDurationSeconds);
    }

    // Network partition fault
    public void injectNetworkPartition(String serviceName1, String serviceName2, int durationSeconds) {
        log.info("Injecting network partition: {} <-> {} - duration: {}s", serviceName1, serviceName2, durationSeconds);
        NetworkPartitionSimulator simulator = new NetworkPartitionSimulator();
        simulator.simulatePartition(serviceName1, serviceName2, durationSeconds);
    }
}
// Chaos experiment planner: schedules and runs the drills
@Slf4j
@Component
public class ChaosExperimentPlanner {

    @Autowired
    private ChaosEngineeringService chaosService;

    @Autowired
    private MetricsCollector metricsCollector;

    // Run the drill plan every night at 02:00
    @Scheduled(cron = "0 0 2 * * ?")
    public void runChaosExperiments() {
        log.info("Starting scheduled chaos experiments");
        try {
            // Baseline metrics before the drills
            Map<String, Object> baselineMetrics = metricsCollector.collectBaselineMetrics();
            // Network latency drill
            runNetworkLatencyExperiment();
            // Service unavailability drill
            runServiceUnavailableExperiment();
            // Connection pool exhaustion drill
            runDatabaseConnectionExhaustionExperiment();
            // Metrics after the drills
            Map<String, Object> postExperimentMetrics = metricsCollector.collectPostExperimentMetrics();
            // Produce the experiment report
            generateExperimentReport(baselineMetrics, postExperimentMetrics);
        } catch (Exception e) {
            log.error("Chaos experiments failed", e);
        }
    }

    private void runNetworkLatencyExperiment() {
        log.info("Running the network latency experiment");
        // Inject 100ms of latency into the user service
        chaosService.injectNetworkLatency("user-service", 100);
        // Observe the system for 30 seconds
        sleep(30);
        // Remove the fault
        removeNetworkLatency("user-service");
    }

    private void runServiceUnavailableExperiment() {
        log.info("Running the service unavailability experiment");
        // Inject a 50% error rate into the order service
        chaosService.injectServiceUnavailable("order-service", 50, 60);
        // Observe the system for 60 seconds
        sleep(60);
        // The fault removes itself when the duration elapses
    }

    private void runDatabaseConnectionExhaustionExperiment() {
        log.info("Running the connection pool exhaustion experiment");
        // Cap the connection pool at 5 connections
        chaosService.injectDatabaseConnectionExhaustion("primary-datasource", 5);
        // Observe the system for 30 seconds
        sleep(30);
        // Restore the normal pool size
        restoreDatabaseConnectionPool("primary-datasource");
    }
}
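The planner calls a few helpers the text leaves out (sleep, removeNetworkLatency, restoreDatabaseConnectionPool, generateExperimentReport). Minimal versions, to be placed inside ChaosExperimentPlanner, might look like this; the restore/remove bodies are illustrative stubs:
// Assumed helpers inside ChaosExperimentPlanner (illustrative sketches;
// requires java.util.concurrent.TimeUnit)
private void sleep(int seconds) {
    try {
        TimeUnit.SECONDS.sleep(seconds);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();  // preserve the interrupt flag
    }
}

private void removeNetworkLatency(String serviceName) {
    log.info("Removing injected latency from {}", serviceName);
    // Undo whatever the injector changed, e.g. delete the tc/netem rule
}

private void restoreDatabaseConnectionPool(String dataSourceName) {
    log.info("Restoring pool size for {}", dataSourceName);
    // Reset the pool back to its configured maximum
}

private void generateExperimentReport(Map<String, Object> baseline, Map<String, Object> after) {
    // Diff the two metric snapshots and persist or notify the result
    log.info("Experiment report: baseline={}, after={}", baseline, after);
}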
3. Capacity Planning and Assessment
// Capacity planning service
@Service
public class CapacityPlanningService {

    @Autowired
    private MetricsService metricsService;

    @Autowired
    private PredictionService predictionService;

    // Assess the system's current capacity
    public CapacityAssessment assessCurrentCapacity() {
        CapacityAssessment assessment = new CapacityAssessment();
        // 1. Collect current metrics
        double currentQps = metricsService.getCurrentQps();
        double cpuUsage = metricsService.getAverageCpuUsage();
        double memoryUsage = metricsService.getAverageMemoryUsage();
        double responseTime = metricsService.getAverageResponseTime();
        int activeConnections = metricsService.getActiveConnections();
        // 2. Compute utilization ratios
        assessment.setQpsUtilization(currentQps / getMaxQps());
        assessment.setCpuUtilization(cpuUsage / 100);
        assessment.setMemoryUtilization(memoryUsage / 100);
        assessment.setResponseTimeRatio(responseTime / getTargetResponseTime());
        // cast to double to avoid integer division truncating to 0
        assessment.setConnectionUtilization((double) activeConnections / getMaxConnections());
        // 3. Identify the bottleneck
        assessment.setBottleneck(identifyBottleneck(assessment));
        // 4. Grade the risk level
        assessment.setRiskLevel(calculateRiskLevel(assessment));
        // 5. Generate recommendations
        assessment.setRecommendations(generateRecommendations(assessment));
        return assessment;
    }

    // Forecast future capacity needs
    public CapacityForecast forecastFutureCapacity(int daysAhead) {
        CapacityForecast forecast = new CapacityForecast();
        // 1. Load historical data (last 30 days)
        List<MetricData> historicalData = metricsService.getHistoricalMetrics(30);
        // 2. Predict future load
        double predictedQps = predictionService.predictQps(historicalData, daysAhead);
        double predictedUserGrowth = predictionService.predictUserGrowth(daysAhead);
        // 3. Derive the required resources
        int requiredInstances = calculateRequiredInstances(predictedQps);
        ResourceRequirement resourceReq = calculateResourceRequirements(predictedQps);
        // 4. Estimate when to scale out
        LocalDateTime expansionTime = calculateExpansionTime(predictedQps);
        forecast.setPredictedQps(predictedQps);
        forecast.setPredictedUserGrowth(predictedUserGrowth);
        forecast.setRequiredInstances(requiredInstances);
        forecast.setResourceRequirement(resourceReq);
        forecast.setRecommendedExpansionTime(expansionTime);
        forecast.setConfidenceLevel(calculateConfidenceLevel(historicalData));
        return forecast;
    }

    // Produce a capacity report
    public CapacityReport generateCapacityReport() {
        CapacityReport report = new CapacityReport();
        // Current capacity assessment
        report.setCurrentCapacity(assessCurrentCapacity());
        // Forecasts
        report.setForecast30Days(forecastFutureCapacity(30));
        report.setForecast90Days(forecastFutureCapacity(90));
        // Cost analysis
        report.setCostAnalysis(analyzeCosts());
        // Risk assessment
        report.setRiskAssessment(assessRisks());
        // Action plan
        report.setActionPlan(generateActionPlan(report));
        return report;
    }

    private String identifyBottleneck(CapacityAssessment assessment) {
        if (assessment.getCpuUtilization() > 0.8) {
            return "CPU utilization is high; add compute or optimize the code";
        } else if (assessment.getMemoryUtilization() > 0.8) {
            return "Memory utilization is high; add memory or optimize memory use";
        } else if (assessment.getQpsUtilization() > 0.8) {
            return "QPS is near its ceiling; scale service instances out horizontally";
        } else if (assessment.getResponseTimeRatio() > 1.5) {
            return "Response time is too long; optimize performance or add resources";
        } else if (assessment.getConnectionUtilization() > 0.8) {
            return "Connection count is near its limit; enlarge the pool or optimize connection use";
        }
        return "System is healthy; no bottleneck detected";
    }

    private RiskLevel calculateRiskLevel(CapacityAssessment assessment) {
        double maxUtilization = Math.max(
                assessment.getCpuUtilization(),
                Math.max(assessment.getMemoryUtilization(),
                        Math.max(assessment.getQpsUtilization(), assessment.getConnectionUtilization()))
        );
        if (maxUtilization > 0.9) {
            return RiskLevel.CRITICAL;
        } else if (maxUtilization > 0.8) {
            return RiskLevel.HIGH;
        } else if (maxUtilization > 0.7) {
            return RiskLevel.MEDIUM;
        } else if (maxUtilization > 0.6) {
            return RiskLevel.LOW;
        } else {
            return RiskLevel.MINIMAL;
        }
    }
}
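The calculateRequiredInstances step reduces to a back-of-the-envelope formula: instances = ceil(peak QPS x redundancy factor / per-instance QPS). The sketch below makes it concrete; the 1.5 redundancy factor (N+50% headroom) and the per-instance throughput are assumptions, not values from the original text.
// Instance-count estimate from predicted load (illustrative parameters)
public final class InstanceCountEstimator {

    public static int requiredInstances(double peakQps, double perInstanceQps, double redundancyFactor) {
        return (int) Math.ceil(peakQps * redundancyFactor / perInstanceQps);
    }

    public static void main(String[] args) {
        // 12,000 QPS at peak, 800 QPS per instance, 50% headroom -> 23 instances
        System.out.println(requiredInstances(12_000, 800, 1.5));
    }
}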
Evolution Path of High-Availability Architecture
Summary
High-availability architecture is as much an art as an engineering discipline: it demands trade-offs and optimization across several dimensions at once:
Core Principles
- Eliminate single points of failure: use redundant design to remove every SPOF from the system
- Fault isolation: keep a fault from spreading and amplifying its impact
- Automatic recovery: detect faults and recover from them without human intervention
- Multi-active architecture: deploy active sites across regions
Key Techniques
- Load balancing: distribute requests sensibly and fail over automatically
- Data high availability: keep data reliable and consistent
- Service high availability: design microservices to survive instance and dependency failures
- Multi-active architecture: run active-active deployments across regions
- Monitoring and alerting: watch system state in real time and detect faults early