AI驱动的持续集成与部署优化:智能化DevOps工作流

摘要

持续集成与部署(CI/CD)是现代软件开发的核心实践,但传统的CI/CD流程往往依赖固定的规则和人工干预。本文将探讨如何利用AI技术优化CI/CD流程:从智能构建调度到自动化测试优化,从部署风险评估到性能监控,全面提升软件交付的效率和质量。

目录

  1. AI在CI/CD中的应用概述
  2. 智能构建调度与优化
  3. 自动化测试智能化
  4. 部署风险评估与预测
  5. 性能监控与异常检测
  6. 资源配置优化
  7. 故障预测与自愈
  8. 代码质量智能分析
  9. 实战案例分析
  10. 最佳实践与总结

AI在CI/CD中的应用概述

智能CI/CD系统架构

AI核心模块
机器学习模型
预测算法
优化算法
异常检测算法
代码提交
AI构建调度器
智能测试选择
并行构建优化
资源分配优化
测试执行引擎
测试结果分析
质量评估
性能基准测试
部署风险评估
智能部署策略
回滚决策
监控配置
生产环境部署
实时监控
性能分析
异常检测
自动扩缩容

核心CI/CD框架

import json
import logging
import subprocess
import threading
import time
import warnings
import zlib
from abc import ABC, abstractmethod
from collections import defaultdict, deque
from dataclasses import dataclass, asdict, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Dict, List, Any, Optional, Tuple, Union

import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

class BuildStatus(Enum):
    """Lifecycle states of a job.

    Used as the ``status`` field of both ``BuildJob`` and ``DeploymentJob``.
    """
    PENDING = "pending"      # initial state assigned when a build job is created
    RUNNING = "running"      # set once resources have been allocated to the build
    SUCCESS = "success"      # build finished without error
    FAILED = "failed"        # build finished with an error (see failure_reason)
    CANCELLED = "cancelled"
    TIMEOUT = "timeout"

class DeploymentStage(Enum):
    """Target environments a deployment job can be aimed at."""
    DEV = "development"
    TEST = "testing"
    STAGING = "staging"
    PRODUCTION = "production"

class TestType(Enum):
    """Categories of tests; used as dictionary keys when grouping test names."""
    UNIT = "unit"
    INTEGRATION = "integration"
    E2E = "e2e"                  # end-to-end tests
    PERFORMANCE = "performance"
    SECURITY = "security"

class AlertSeverity(Enum):
    """Severity levels of an alert, listed from least to most severe."""
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"

@dataclass
class BuildJob:
    """Record describing one build request and its outcome.

    Every field is required (no defaults). The scheduler in this file creates
    jobs with status PENDING, zero duration and empty containers, and fills
    them in as the build progresses.
    """
    id: str                                        # build identifier
    project_name: str                              # project being built
    branch: str                                    # branch name, e.g. 'main' or 'feature/...'
    commit_hash: str                               # commit being built
    author: str                                    # commit/build author
    timestamp: datetime                            # creation time of the job
    status: BuildStatus                            # current lifecycle state
    duration: float                                # build duration; the simulated worker treats it as seconds
    test_results: Dict[TestType, Dict[str, Any]]   # per-test-type results; inner schema not shown here
    artifacts: List[str]                           # produced artifacts (presumably paths/names -- confirm)
    dependencies: List[str]                        # declared dependencies of the build
    resource_usage: Dict[str, float]               # presumably keyed by resource name -- TODO confirm
    failure_reason: Optional[str]                  # human-readable failure text, None when not failed
    retry_count: int                               # number of retries already performed for this job

@dataclass
class DeploymentJob:
    """Record describing one deployment of a build to a target stage.

    Every field is required (no defaults).
    """
    id: str                                # deployment identifier
    build_id: str                          # identifier of the build being deployed
    stage: DeploymentStage                 # target environment
    version: str                           # version being deployed
    timestamp: datetime                    # deployment time
    status: BuildStatus                    # deployment state (reuses the build status enum)
    duration: float                        # deployment duration (unit not shown here -- presumably seconds)
    rollback_version: Optional[str]        # version to roll back to, if any
    health_checks: Dict[str, bool]         # health-check name -> passed?
    performance_metrics: Dict[str, float]  # metric name -> value
    risk_score: float                      # assessed risk (scale not shown here -- TODO confirm)
    approval_required: bool                # whether manual approval is needed

@dataclass
class TestExecution:
    """Record describing one execution of a test suite for a build.

    Every field is required (no defaults).
    """
    id: str                   # execution identifier
    build_id: str             # identifier of the build the tests ran against
    test_type: TestType       # category of the executed tests
    test_suite: str           # suite name
    test_cases: List[str]     # names of the test cases included in this execution
    passed: int               # number of passed test cases
    failed: int               # number of failed test cases
    skipped: int              # number of skipped test cases
    duration: float           # execution duration (unit not shown here -- presumably seconds)
    coverage: float           # code coverage (scale not shown here -- TODO confirm 0-1 vs percent)
    flaky_tests: List[str]    # test names flagged as flaky for this execution
    new_failures: List[str]   # failing test names; FlakyTestDetector reads this to count failures

@dataclass
class SystemMetrics:
    """Snapshot of system-level metrics at one point in time.

    Every field is required (no defaults). Units and scales are not defined
    in this part of the file -- confirm against the producer of these records.
    """
    timestamp: datetime    # time the snapshot was taken
    cpu_usage: float       # CPU usage
    memory_usage: float    # memory usage
    disk_usage: float      # disk usage
    network_io: float      # network I/O
    response_time: float   # response time
    error_rate: float      # error rate
    throughput: float      # throughput
    active_users: int      # number of active users

@dataclass
class Alert:
    """Record describing one alert and its resolution state.

    Every field is required (no defaults).
    """
    id: str                               # alert identifier
    timestamp: datetime                   # time the alert was raised
    severity: AlertSeverity               # severity level
    source: str                           # origin of the alert
    message: str                          # human-readable description
    metrics: Dict[str, float]             # metric name -> value attached to the alert
    resolved: bool                        # whether the alert has been resolved
    resolution_time: Optional[datetime]   # resolution time, None while unresolved
    false_positive: bool                  # whether the alert was judged a false positive

class IntelligentBuildScheduler:
    """Priority-based build scheduler backed by a shared resource pool.

    Build requests are turned into ``BuildJob`` objects, scored, queued and
    started as soon as the resource pool can host them. Each build runs on a
    daemon worker thread (a simulation here) that calls back into the
    scheduler when it finishes, so all shared state is guarded by a
    re-entrant lock.
    """

    # Upper bound on automatic retries of a single logical build.
    MAX_RETRIES = 3

    def __init__(self):
        self.build_history = []
        self.resource_pool = ResourcePool()
        self.predictor = BuildTimePredictor()
        self.optimizer = BuildOptimizer()
        self.queue = deque()
        self.running_builds = {}
        # Re-entrant because completion -> retry -> schedule_build nests.
        self._lock = threading.RLock()

    def schedule_build(self, build_request: Dict[str, Any]) -> str:
        """Queue a build described by *build_request* and try to start it.

        The request must contain 'project', 'branch', 'commit' and 'author';
        'dependencies', 'importance', 'urgent' and 'retry_count' are optional.

        Returns:
            The id of the newly created build job.
        """
        with self._lock:
            build_job = self._create_build_job(build_request)

            # Predict duration and resource needs before queuing.
            predicted_duration = self.predictor.predict_build_time(build_job)
            resource_requirements = self.predictor.predict_resource_needs(build_job)

            priority = self._calculate_build_priority(build_job, build_request)

            optimized_config = self.optimizer.optimize_build_config(
                build_job, resource_requirements
            )

            self.queue.append({
                'build_job': build_job,
                'priority': priority,
                'predicted_duration': predicted_duration,
                'resource_requirements': resource_requirements,
                'optimized_config': optimized_config
            })

            self._trigger_scheduling()

            return build_job.id

    def _create_build_job(self, build_request: Dict[str, Any]) -> BuildJob:
        """Build a PENDING ``BuildJob`` from the raw request dictionary.

        ``retry_count`` is taken from the request (default 0) so that a
        re-scheduled build keeps counting towards ``MAX_RETRIES``.
        """
        return BuildJob(
            id=f"build_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{np.random.randint(1000, 9999)}",
            project_name=build_request['project'],
            branch=build_request['branch'],
            commit_hash=build_request['commit'],
            author=build_request['author'],
            timestamp=datetime.now(),
            status=BuildStatus.PENDING,
            duration=0.0,
            test_results={},
            artifacts=[],
            dependencies=build_request.get('dependencies', []),
            resource_usage={},
            failure_reason=None,
            retry_count=build_request.get('retry_count', 0)
        )

    def _calculate_build_priority(self, build_job: BuildJob,
                                build_request: Dict[str, Any]) -> float:
        """Score a build; higher scores are scheduled first.

        The score is the sum of a branch weight, the author's historical
        success rate (x3), the request's 'importance' (default 5.0), a bonus
        of 15 for 'urgent' requests and 2 points per dependency.
        """
        branch = build_job.branch
        if branch in ('main', 'master'):
            priority = 10.0
        elif branch.startswith('release/'):
            priority = 8.0
        elif branch.startswith('hotfix/'):
            priority = 9.0
        else:
            priority = 5.0

        priority += self._get_author_success_rate(build_job.author) * 3.0
        priority += build_request.get('importance', 5.0)

        if build_request.get('urgent', False):
            priority += 15.0

        priority += len(build_job.dependencies) * 2.0

        return priority

    def _trigger_scheduling(self):
        """Start as many queued builds as the resource pool allows.

        The queue is re-sorted by priority (highest first). Builds whose
        resources cannot be allocated right now are put back at the front of
        the queue instead of being discarded; they are retried on the next
        scheduling pass (triggered by a new request or a finished build).
        """
        with self._lock:
            self.queue = deque(
                sorted(self.queue, key=lambda item: item['priority'], reverse=True)
            )

            deferred = []
            while self.queue and self._has_available_resources():
                build_item = self.queue.popleft()
                if not self._start_build(build_item):
                    deferred.append(build_item)

            # Restore deferred items ahead of the rest, keeping their order.
            self.queue.extendleft(reversed(deferred))

    def _has_available_resources(self) -> bool:
        """Return True while more than 20% of the pool capacity is free."""
        return self.resource_pool.get_available_capacity() > 0.2

    def _start_build(self, build_item: Dict[str, Any]) -> bool:
        """Allocate resources for a queued build and launch it.

        Returns:
            True when the build was started, False when the pool could not
            satisfy its resource requirements (the build is left untouched).
        """
        with self._lock:
            build_job = build_item['build_job']

            allocated_resources = self.resource_pool.allocate_resources(
                build_item['resource_requirements']
            )
            if not allocated_resources:
                return False

            build_job.status = BuildStatus.RUNNING
            self.running_builds[build_job.id] = {
                'build_job': build_job,
                'start_time': datetime.now(),
                'allocated_resources': allocated_resources,
                'config': build_item['optimized_config']
            }

            self._execute_build_async(build_job, build_item['optimized_config'])
            return True

    def _execute_build_async(self, build_job: BuildJob, config: Dict[str, Any]):
        """Run the build on a daemon thread (simulated implementation).

        A real system would launch the actual build process here. The
        simulation draws a duration (mean 300 s, std 60 s), sleeps for a
        scaled-down amount capped at 10 s and succeeds with probability 0.8.
        """

        def build_worker():
            try:
                # Clamp at zero: a normal draw can be negative and
                # time.sleep() rejects negative values.
                build_duration = max(0.0, np.random.normal(300, 60))
                time.sleep(min(build_duration / 60, 10))

                success_probability = 0.8
                if np.random.random() < success_probability:
                    build_job.status = BuildStatus.SUCCESS
                else:
                    build_job.status = BuildStatus.FAILED
                    build_job.failure_reason = "Build compilation failed"

                build_job.duration = build_duration
            except Exception as e:
                build_job.status = BuildStatus.FAILED
                build_job.failure_reason = str(e)
            finally:
                # Completion runs exactly once and outside the try body, so a
                # bookkeeping error cannot relabel a finished build as failed.
                self._complete_build(build_job)

        thread = threading.Thread(target=build_worker)
        thread.daemon = True
        thread.start()

    def _complete_build(self, build_job: BuildJob):
        """Release resources, record history and continue scheduling.

        Does nothing when the build is not registered as running.
        """
        with self._lock:
            build_info = self.running_builds.pop(build_job.id, None)
            if build_info is None:
                return

            self.resource_pool.release_resources(build_info['allocated_resources'])
            self.build_history.append(build_job)

            self._post_build_processing(build_job)

            # Freed resources may allow queued builds to start.
            self._trigger_scheduling()

    def _post_build_processing(self, build_job: BuildJob):
        """Feed the result to the predictor and re-schedule transient failures.

        The retry request carries the dependencies and an incremented
        'retry_count' so the ``MAX_RETRIES`` cap is actually enforced.
        """
        self.predictor.update_model(build_job)

        if (build_job.status == BuildStatus.FAILED
                and build_job.retry_count < self.MAX_RETRIES
                and self._should_retry(build_job)):
            retry_request = {
                'project': build_job.project_name,
                'branch': build_job.branch,
                'commit': build_job.commit_hash,
                'author': build_job.author,
                'dependencies': list(build_job.dependencies),
                'retry': True,
                'retry_count': build_job.retry_count + 1
            }
            self.schedule_build(retry_request)

    def _should_retry(self, build_job: BuildJob) -> bool:
        """Return True when the failure reason looks transient.

        A failure is considered transient when its reason contains one of a
        fixed list of phrases (case-insensitive).
        """
        transient_failures = [
            "network timeout",
            "resource unavailable",
            "temporary service failure"
        ]

        if not build_job.failure_reason:
            return False

        reason = build_job.failure_reason.lower()
        return any(failure in reason for failure in transient_failures)

    def _get_author_success_rate(self, author: str) -> float:
        """Return the fraction of the author's recorded builds that succeeded.

        Returns 0.5 when the author has no recorded builds.
        """
        author_builds = [build for build in self.build_history
                         if build.author == author]
        if not author_builds:
            return 0.5

        successful = sum(1 for build in author_builds
                         if build.status == BuildStatus.SUCCESS)
        return successful / len(author_builds)

class BuildTimePredictor:
    """Heuristic predictor of build duration and resource requirements.

    Predictions are based on the average duration of similar completed
    builds; no trained model is used yet (``_retrain_model`` is a stub).
    """

    def __init__(self):
        self.historical_data = []
        self.model = None
        self.feature_extractors = self._initialize_feature_extractors()

    def predict_build_time(self, build_job: BuildJob) -> float:
        """Predict the duration of *build_job*.

        Returns 300.0 when there is no history or no sufficiently similar
        build; otherwise the mean duration of the most similar builds scaled
        by a complexity factor, with a floor of 60.0.
        """
        if not self.historical_data:
            return 300.0

        similar_builds = self._find_similar_builds(build_job)
        if not similar_builds:
            return 300.0

        predicted_time = float(np.mean([build.duration for build in similar_builds]))
        predicted_time *= self._calculate_complexity_factor(build_job)

        return max(60.0, predicted_time)

    def predict_resource_needs(self, build_job: BuildJob) -> Dict[str, float]:
        """Estimate the resources a build needs.

        Starts from fixed base values, scales them by the project type
        inferred from the project name, then scales every resource by
        ``1 + 0.1 * number_of_dependencies``.
        """
        base_requirements = {
            'cpu': 1.0,
            'memory': 2.0,  # GB
            'disk': 5.0,    # GB
            'network': 0.1  # Mbps
        }

        project_type = self._infer_project_type(build_job.project_name)

        if project_type == 'java':
            base_requirements['memory'] *= 2.0
            base_requirements['cpu'] *= 1.5
        elif project_type == 'node':
            base_requirements['disk'] *= 1.5
            base_requirements['network'] *= 2.0
        elif project_type == 'python':
            base_requirements['cpu'] *= 1.2

        dependency_factor = 1 + len(build_job.dependencies) * 0.1
        for resource in base_requirements:
            base_requirements[resource] *= dependency_factor

        return base_requirements

    def update_model(self, completed_build: BuildJob):
        """Record a completed build and retrain once enough data exists.

        Only the 1000 most recent builds are kept; retraining is triggered
        when at least 50 builds are recorded.
        """
        self.historical_data.append(completed_build)

        if len(self.historical_data) > 1000:
            self.historical_data = self.historical_data[-1000:]

        if len(self.historical_data) >= 50:
            self._retrain_model()

    def _extract_features(self, build_job: BuildJob) -> Dict[str, float]:
        """Turn a build job into a flat numeric feature dictionary.

        The project name is bucketed with CRC32 rather than the built-in
        ``hash()``, because string hashes are salted per process and would
        give a different feature value on every run.
        """
        project_bucket = zlib.crc32(build_job.project_name.encode('utf-8')) % 1000
        return {
            'project_hash': project_bucket,
            'branch_type': self._encode_branch_type(build_job.branch),
            'dependency_count': len(build_job.dependencies),
            'hour_of_day': build_job.timestamp.hour,
            'day_of_week': build_job.timestamp.weekday(),
            'author_experience': self._get_author_experience(build_job.author)
        }

    def _find_similar_builds(self, build_job: BuildJob) -> List[BuildJob]:
        """Return up to 10 historical builds with similarity above 0.7.

        Results are ordered from most to least similar; each similarity is
        computed once.
        """
        scored = [
            (self._calculate_similarity(build_job, past_build), past_build)
            for past_build in self.historical_data
        ]
        matches = [pair for pair in scored if pair[0] > 0.7]
        # Sort on the score only so build objects are never compared.
        matches.sort(key=lambda pair: pair[0], reverse=True)
        return [past_build for _, past_build in matches[:10]]

    def _calculate_similarity(self, build1: BuildJob, build2: BuildJob) -> float:
        """Score how alike two builds are, from 0.0 to 1.0.

        Weights: same project 0.4, same branch type 0.2, dependency-count
        closeness up to 0.2, same author 0.1, closeness in time up to 0.1
        (decaying linearly to zero over 30 days).
        """
        similarity = 0.0

        if build1.project_name == build2.project_name:
            similarity += 0.4

        if self._get_branch_type(build1.branch) == self._get_branch_type(build2.branch):
            similarity += 0.2

        dep_diff = abs(len(build1.dependencies) - len(build2.dependencies))
        similarity += max(0, 1 - dep_diff / 10) * 0.2

        if build1.author == build2.author:
            similarity += 0.1

        # Take abs() of the timedelta itself: a negative timedelta floors its
        # .days (e.g. -1 hour has days == -1), which would make the score
        # depend on argument order.
        time_diff = abs(build1.timestamp - build2.timestamp).days
        similarity += max(0, 1 - time_diff / 30) * 0.1

        return similarity

    def _calculate_complexity_factor(self, build_job: BuildJob) -> float:
        """Return a multiplier for the predicted time.

        Feature branches get x1.2, hotfix branches x0.8, and every dependency
        adds 5%.
        """
        factor = 1.0

        if build_job.branch.startswith('feature/'):
            factor *= 1.2
        elif build_job.branch.startswith('hotfix/'):
            factor *= 0.8

        factor *= (1 + len(build_job.dependencies) * 0.05)

        return factor

    def _infer_project_type(self, project_name: str) -> str:
        """Guess the project type from substrings of the project name.

        Returns 'java', 'node', 'python' or 'unknown'; checks are
        case-insensitive and evaluated in that order.
        """
        lowered = project_name.lower()
        if 'java' in lowered or 'spring' in lowered:
            return 'java'
        if 'node' in lowered or 'js' in lowered:
            return 'node'
        if 'python' in lowered or 'py' in lowered:
            return 'python'
        return 'unknown'

    def _encode_branch_type(self, branch: str) -> float:
        """Map a branch name to a numeric category code (1.0 to 5.0)."""
        if branch in ['main', 'master']:
            return 1.0
        if branch.startswith('release/'):
            return 2.0
        if branch.startswith('feature/'):
            return 3.0
        if branch.startswith('hotfix/'):
            return 4.0
        return 5.0

    def _get_branch_type(self, branch: str) -> str:
        """Map a branch name to 'main', 'release', 'feature', 'hotfix' or 'other'."""
        if branch in ['main', 'master']:
            return 'main'
        if branch.startswith('release/'):
            return 'release'
        if branch.startswith('feature/'):
            return 'feature'
        if branch.startswith('hotfix/'):
            return 'hotfix'
        return 'other'

    def _get_author_experience(self, author: str) -> float:
        """Return the author's recorded build count / 100, capped at 1.0."""
        build_count = sum(1 for build in self.historical_data
                          if build.author == author)
        return min(build_count / 100.0, 1.0)

    def _initialize_feature_extractors(self) -> Dict[str, Any]:
        """Return empty encoder tables for project, branch and author."""
        return {
            'project_encoder': {},
            'branch_encoder': {},
            'author_encoder': {}
        }

    def _retrain_model(self):
        """Retrain the prediction model (placeholder, intentionally a no-op)."""
        # A real implementation would fit a machine-learning model here.
        pass

class BuildOptimizer:
    """Derives a build configuration from a job and its resource estimate."""

    def optimize_build_config(self, build_job: BuildJob,
                            resource_requirements: Dict[str, float]) -> Dict[str, Any]:
        """Assemble the full build configuration.

        Returns:
            A dict with the keys 'parallel_jobs', 'cache_strategy',
            'test_selection', 'resource_limits' and 'build_tools'.
        """
        return {
            'parallel_jobs': self._calculate_optimal_parallelism(resource_requirements),
            'cache_strategy': self._select_cache_strategy(build_job),
            'test_selection': self._optimize_test_selection(build_job),
            'resource_limits': self._set_resource_limits(resource_requirements),
            'build_tools': self._optimize_build_tools(build_job)
        }

    def _calculate_optimal_parallelism(self, resource_requirements: Dict[str, float]) -> int:
        """Return the largest of 8, 4, 2 that the CPU requirement reaches, else 1."""
        cpu_cores = resource_requirements.get('cpu', 1.0)

        for level in (8, 4, 2):
            if cpu_cores >= level:
                return level
        return 1

    def _select_cache_strategy(self, build_job: BuildJob) -> Dict[str, Any]:
        """Choose which caches to enable for the build.

        Tool-specific caches are added according to the project type inferred
        from the project name. Docker layer caching is enabled only when the
        project *name* contains 'docker'; no files are inspected.
        """
        strategy = {
            'dependency_cache': True,
            'build_cache': True,
            'test_cache': True,
            'docker_layer_cache': False
        }

        project_type = self._infer_project_type(build_job.project_name)

        if project_type == 'java':
            strategy['maven_cache'] = True
            strategy['gradle_cache'] = True
        elif project_type == 'node':
            strategy['npm_cache'] = True
            strategy['node_modules_cache'] = True
        elif project_type == 'python':
            strategy['pip_cache'] = True
            strategy['virtualenv_cache'] = True

        if 'docker' in build_job.project_name.lower():
            strategy['docker_layer_cache'] = True

        return strategy

    def _optimize_test_selection(self, build_job: BuildJob) -> Dict[str, Any]:
        """Choose test-selection flags based on the branch.

        main/master and release branches run all tests (release also enables
        performance tests); hotfix branches focus on regression tests.
        """
        selection = {
            'run_all_tests': False,
            'smart_test_selection': True,
            'parallel_test_execution': True,
            'test_prioritization': True
        }

        if build_job.branch in ['main', 'master']:
            selection['run_all_tests'] = True
        elif build_job.branch.startswith('release/'):
            selection['run_all_tests'] = True
            selection['include_performance_tests'] = True
        elif build_job.branch.startswith('hotfix/'):
            selection['focus_on_regression_tests'] = True

        return selection

    @staticmethod
    def _format_gigabytes(amount: float) -> str:
        """Format *amount* as '<number>G', rounded to two decimals.

        Rounding keeps binary floating-point noise out of the limit string
        (4.4 * 1.1 would otherwise render as '4.840000000000001G').
        """
        return f"{round(amount, 2)}G"

    def _set_resource_limits(self, resource_requirements: Dict[str, float]) -> Dict[str, Any]:
        """Turn resource requirements into limits with headroom.

        CPU gets a 20% buffer, memory and disk a 10% buffer; the timeout is
        fixed at 30 minutes.
        """
        return {
            'cpu_limit': resource_requirements.get('cpu', 1.0) * 1.2,
            'memory_limit': self._format_gigabytes(resource_requirements.get('memory', 2.0) * 1.1),
            'disk_limit': self._format_gigabytes(resource_requirements.get('disk', 5.0) * 1.1),
            'timeout': '30m'
        }

    def _optimize_build_tools(self, build_job: BuildJob) -> Dict[str, Any]:
        """Return build-tool settings, extended per inferred project type."""
        tools = {
            'compiler_optimization': True,
            'incremental_build': True,
            'build_tool_version': 'latest_stable'
        }

        project_type = self._infer_project_type(build_job.project_name)

        if project_type == 'java':
            tools.update({
                'maven_opts': '-Xmx2g -XX:+UseG1GC',
                'gradle_opts': '--parallel --daemon'
            })
        elif project_type == 'node':
            tools.update({
                'node_version': 'lts',
                'npm_config': '--prefer-offline --no-audit'
            })
        elif project_type == 'python':
            tools.update({
                'python_version': '3.9',
                'pip_config': '--cache-dir /tmp/pip-cache'
            })

        return tools

    def _infer_project_type(self, project_name: str) -> str:
        """Guess the project type from substrings of the project name.

        Returns 'java', 'node', 'python' or 'unknown'; checks are
        case-insensitive and evaluated in that order.
        """
        lowered = project_name.lower()
        if 'java' in lowered or 'spring' in lowered:
            return 'java'
        if 'node' in lowered or 'js' in lowered:
            return 'node'
        if 'python' in lowered or 'py' in lowered:
            return 'python'
        return 'unknown'

class ResourcePool:
    """Tracks a fixed pool of build resources and the allocations made from it."""

    def __init__(self):
        self.total_capacity = {
            'cpu': 16.0,      # 16 CPU cores
            'memory': 64.0,   # 64 GB memory
            'disk': 1000.0,   # 1 TB disk
            'network': 10.0   # 10 Gbps network
        }
        self.allocated_resources = {
            'cpu': 0.0,
            'memory': 0.0,
            'disk': 0.0,
            'network': 0.0
        }
        self.allocations = {}  # allocation id -> allocated amounts

    def _free_fraction(self, resource: str) -> float:
        """Return the unallocated share (0.0 to 1.0) of one resource."""
        total = self.total_capacity[resource]
        return (total - self.allocated_resources[resource]) / total

    def get_available_capacity(self) -> float:
        """Return the free fraction of the scarcer of CPU and memory."""
        return min(self._free_fraction('cpu'), self._free_fraction('memory'))

    def _new_allocation_id(self) -> str:
        """Generate an allocation id that is not currently in use.

        The id only has second resolution plus four random digits, so ids
        drawn within the same second can collide; a collision would overwrite
        a live allocation record and leak its resources, hence the re-draw.
        """
        while True:
            candidate = f"alloc_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{np.random.randint(1000, 9999)}"
            if candidate not in self.allocations:
                return candidate

    def allocate_resources(self, requirements: Dict[str, float]) -> Optional[str]:
        """Reserve the requested amounts, all or nothing.

        Args:
            requirements: resource name -> amount to reserve.

        Returns:
            An allocation id to pass to ``release_resources``, or None when
            any resource is unknown to the pool or lacks enough free capacity
            (nothing is reserved in that case).
        """
        for resource, required in requirements.items():
            if resource not in self.total_capacity:
                return None  # the pool has none of this resource
            available = self.total_capacity[resource] - self.allocated_resources[resource]
            if available < required:
                return None  # not enough free capacity

        allocation_id = self._new_allocation_id()

        for resource, required in requirements.items():
            self.allocated_resources[resource] += required

        self.allocations[allocation_id] = requirements.copy()

        return allocation_id

    def release_resources(self, allocation_id: str):
        """Return the resources of an allocation to the pool.

        Unknown ids are ignored. Allocated totals are clamped at zero.
        """
        allocation = self.allocations.pop(allocation_id, None)
        if allocation is None:
            return

        for resource, amount in allocation.items():
            self.allocated_resources[resource] = max(
                0, self.allocated_resources[resource] - amount
            )

    def get_resource_utilization(self) -> Dict[str, float]:
        """Return the allocated fraction of every resource in the pool."""
        return {
            resource: self.allocated_resources[resource] / total
            for resource, total in self.total_capacity.items()
        }

class IntelligentTestSelector:
    """Chooses which tests to run for a build.

    Combines change-impact analysis with a fixed set of critical tests,
    drops tests reported as flaky and finally applies a per-branch policy.
    """

    def __init__(self):
        self.test_history = []
        self.test_impact_analysis = TestImpactAnalyzer()
        self.flaky_test_detector = FlakyTestDetector()

    def select_tests(self, build_job: BuildJob,
                    changed_files: List[str]) -> Dict[TestType, List[str]]:
        """Return the tests to execute, grouped by test type.

        Args:
            build_job: the build the tests are selected for.
            changed_files: paths of the files changed in this build.

        Returns:
            A dict mapping test types to test names. Within a type each test
            appears once, in first-seen order (impacted tests first, then
            critical tests), before the branch policy is applied.
        """
        selected_tests = {
            TestType.UNIT: [],
            TestType.INTEGRATION: [],
            TestType.E2E: [],
            TestType.PERFORMANCE: [],
            TestType.SECURITY: []
        }

        impacted_tests = self.test_impact_analysis.analyze_impact(changed_files)
        critical_tests = self._get_critical_tests(build_job)

        for source in (impacted_tests, critical_tests):
            for test_type, tests in source.items():
                if test_type in selected_tests:
                    selected_tests[test_type].extend(tests)

        # A test can be both impacted and critical; keep one copy so it is
        # not executed twice. dict.fromkeys preserves first-seen order.
        selected_tests = {
            test_type: list(dict.fromkeys(tests))
            for test_type, tests in selected_tests.items()
        }

        stable_tests = self._filter_stable_tests(selected_tests)

        return self._apply_branch_strategy(build_job, stable_tests)

    def _get_critical_tests(self, build_job: BuildJob) -> Dict[TestType, List[str]]:
        """Return the tests that always run, extended per branch type.

        main/master adds smoke and security tests; release branches add
        load/stress tests and the full regression suite.
        """
        critical_tests = {
            TestType.UNIT: ['core_functionality_test', 'api_contract_test'],
            TestType.INTEGRATION: ['database_integration_test', 'service_integration_test'],
            TestType.E2E: [],
            TestType.PERFORMANCE: [],
            TestType.SECURITY: []
        }

        if build_job.branch in ['main', 'master']:
            critical_tests[TestType.E2E].extend(['smoke_test', 'critical_user_journey'])
            critical_tests[TestType.SECURITY].extend(['security_scan', 'vulnerability_test'])
        elif build_job.branch.startswith('release/'):
            critical_tests[TestType.PERFORMANCE].extend(['load_test', 'stress_test'])
            critical_tests[TestType.E2E].extend(['full_regression_test'])

        return critical_tests

    def _filter_stable_tests(self, selected_tests: Dict[TestType, List[str]]) -> Dict[TestType, List[str]]:
        """Return a copy of *selected_tests* without tests reported as flaky."""
        return {
            test_type: [test for test in tests
                        if not self.flaky_test_detector.is_flaky(test)]
            for test_type, tests in selected_tests.items()
        }

    def _apply_branch_strategy(self, build_job: BuildJob,
                             selected_tests: Dict[TestType, List[str]]) -> Dict[TestType, List[str]]:
        """Trim the selection according to the branch type.

        Feature branches keep unit and integration tests plus the first two
        E2E tests. Hotfix branches keep the first 10 unit tests, the first 5
        integration tests and only 'smoke_test' as E2E. Any other branch
        keeps the selection unchanged.
        """
        if build_job.branch.startswith('feature/'):
            return {
                TestType.UNIT: selected_tests[TestType.UNIT],
                TestType.INTEGRATION: selected_tests[TestType.INTEGRATION],
                TestType.E2E: selected_tests[TestType.E2E][:2],
                TestType.PERFORMANCE: [],
                TestType.SECURITY: []
            }

        if build_job.branch.startswith('hotfix/'):
            return {
                TestType.UNIT: selected_tests[TestType.UNIT][:10],
                TestType.INTEGRATION: selected_tests[TestType.INTEGRATION][:5],
                TestType.E2E: ['smoke_test'],
                TestType.PERFORMANCE: [],
                TestType.SECURITY: []
            }

        return selected_tests

class TestImpactAnalyzer:
    """Maps changed files to the tests they are likely to affect.

    The analysis is purely path-based: directory names, a few substrings and
    file extensions decide which tests are reported.
    """

    def __init__(self):
        # NOTE(review): loaded but not consulted by the path heuristics below.
        self.code_test_mapping = self._load_code_test_mapping()

    def analyze_impact(self, changed_files: List[str]) -> Dict[TestType, List[str]]:
        """Return the impacted tests for all changed files, grouped by type.

        Each test appears once per type, in first-seen order. The order is
        deterministic, which matters because callers may truncate the lists.
        """
        impacted_tests = {
            TestType.UNIT: [],
            TestType.INTEGRATION: [],
            TestType.E2E: [],
            TestType.PERFORMANCE: [],
            TestType.SECURITY: []
        }

        for file_path in changed_files:
            file_impacts = self._analyze_file_impact(file_path)

            for test_type, tests in file_impacts.items():
                if test_type in impacted_tests:
                    impacted_tests[test_type].extend(tests)

        # De-duplicate with dict.fromkeys instead of set(): a set of strings
        # iterates in a per-process order, making the result unpredictable.
        return {
            test_type: list(dict.fromkeys(tests))
            for test_type, tests in impacted_tests.items()
        }

    def _analyze_file_impact(self, file_path: str) -> Dict[TestType, List[str]]:
        """Return the tests impacted by a single changed file.

        The path is normalised to forward slashes with a single leading
        slash, so relative paths ('api/users.py') and Windows-style paths
        match the same directory patterns as '/.../api/users.py'.
        """
        impacts = {
            TestType.UNIT: [],
            TestType.INTEGRATION: [],
            TestType.E2E: [],
            TestType.PERFORMANCE: [],
            TestType.SECURITY: []
        }

        normalized = '/' + file_path.replace('\\', '/').lstrip('/')
        lowered = normalized.lower()

        # Directory-based rules.
        if '/controller/' in normalized or '/api/' in normalized:
            impacts[TestType.UNIT].extend(['controller_test', 'api_test'])
            impacts[TestType.INTEGRATION].extend(['api_integration_test'])
            impacts[TestType.E2E].extend(['api_e2e_test'])

        if '/service/' in normalized:
            impacts[TestType.UNIT].extend(['service_test'])
            impacts[TestType.INTEGRATION].extend(['service_integration_test'])

        if '/model/' in normalized or '/entity/' in normalized:
            impacts[TestType.UNIT].extend(['model_test'])
            impacts[TestType.INTEGRATION].extend(['database_test'])

        if '/security/' in normalized or 'auth' in lowered:
            impacts[TestType.SECURITY].extend(['security_test', 'auth_test'])

        if '/performance/' in normalized or 'cache' in lowered:
            impacts[TestType.PERFORMANCE].extend(['performance_test'])

        # Extension-based rules.
        if normalized.endswith('.sql'):
            impacts[TestType.INTEGRATION].extend(['database_migration_test'])

        if normalized.endswith(('.yml', '.yaml')):
            impacts[TestType.INTEGRATION].extend(['config_test'])

        return impacts

    def _load_code_test_mapping(self) -> Dict[str, List[str]]:
        """Return the source-directory -> test-names mapping.

        Hard-coded here; a real system would load it from configuration or a
        database.
        """
        return {
            'src/main/java/com/example/controller/': ['controller_test'],
            'src/main/java/com/example/service/': ['service_test'],
            'src/main/java/com/example/model/': ['model_test'],
            'src/main/resources/': ['config_test']
        }

class FlakyTestDetector:
    """Classifies tests as flaky from a bounded window of past executions.

    A test counts as flaky when its pass rate over the retained executions
    that included it is below ``stability_threshold``. Note that a test which
    fails every time therefore also counts as flaky. Verdicts are cached per
    test name and dropped whenever the executions they were based on change.
    """

    def __init__(self, stability_threshold: float = 0.8,
                 min_executions: int = 10, max_history: int = 1000):
        """Create a detector.

        Args:
            stability_threshold: Minimum pass rate for a test to be stable.
            min_executions: Fewer recorded runs than this yields "not flaky".
            max_history: Number of most recent executions to retain.
        """
        self.test_execution_history = []
        self.flaky_test_cache = {}
        self.stability_threshold = stability_threshold
        self.min_executions = min_executions
        self.max_history = max_history

    def is_flaky(self, test_name: str) -> bool:
        """Return True when ``test_name`` passes less often than the threshold.

        Returns False (without caching) when there are fewer than
        ``min_executions`` retained executions containing the test.
        """
        cached_verdict = self.flaky_test_cache.get(test_name)
        if cached_verdict is not None:
            return cached_verdict

        relevant_runs = [run for run in self.test_execution_history
                         if test_name in run.test_cases]

        # Not enough evidence (also protects the division below).
        if not relevant_runs or len(relevant_runs) < self.min_executions:
            return False

        # Simplified outcome check: a run counts as a pass unless the test
        # appears in that run's new_failures.
        passed_runs = sum(1 for run in relevant_runs
                          if test_name not in run.new_failures)
        verdict = passed_runs / len(relevant_runs) < self.stability_threshold

        self.flaky_test_cache[test_name] = verdict
        return verdict

    def update_test_execution(self, execution: "TestExecution"):
        """Record an execution and invalidate every verdict it affects.

        Verdicts are invalidated for the tests in the new execution and for
        the tests in any execution evicted by the history limit; otherwise a
        cached verdict could outlive the data it was computed from.
        """
        self.test_execution_history.append(execution)

        evicted_runs = []
        overflow = len(self.test_execution_history) - self.max_history
        if overflow > 0:
            evicted_runs = self.test_execution_history[:overflow]
            self.test_execution_history = self.test_execution_history[overflow:]

        for changed_run in (execution, *evicted_runs):
            for test_case in changed_run.test_cases:
                self.flaky_test_cache.pop(test_case, None)

class DeploymentRiskAssessor:
    """Heuristic scorer estimating how risky a deployment is.

    The overall score blends five partial scores (build, tests, change,
    history, environment). Each partial score and the total are limited to at
    most 1.0.
    """

    def __init__(self):
        self.deployment_history = []
        self.risk_factors = self._initialize_risk_factors()
        self.ml_model = None

    def assess_deployment_risk(self, deployment_job: DeploymentJob, 
                             build_job: BuildJob) -> float:
        """Return the weighted overall risk for deploying ``build_job``.

        Args:
            deployment_job: The deployment being considered.
            build_job: The build that produced the artifacts to deploy.

        Returns:
            The weighted sum of the partial scores, limited to at most 1.0.
        """
        # (partial score, weight) in evaluation order; the weights sum to 1.0.
        weighted_components = (
            (self._assess_build_risk(build_job), 0.3),
            (self._assess_test_risk(build_job), 0.2),
            (self._assess_change_risk(build_job), 0.2),
            (self._assess_historical_risk(deployment_job), 0.15),
            (self._assess_environment_risk(deployment_job), 0.15),
        )

        total = 0.0
        for partial_score, weight in weighted_components:
            total += partial_score * weight

        return min(1.0, total)

    def _assess_build_risk(self, build_job: BuildJob) -> float:
        """Score risk from build status, retries and unusual duration."""
        # A build that did not succeed carries a large base penalty.
        score = 0.8 if build_job.status != BuildStatus.SUCCESS else 0.0

        # Every retry adds a small penalty.
        score += build_job.retry_count * 0.1

        # Penalise builds that took over 1.5x the mean duration recorded in
        # the deployment history.
        if self.deployment_history:
            recorded_durations = [entry.duration
                                  for entry in self.deployment_history
                                  if hasattr(entry, 'duration')]
            typical_duration = np.mean(recorded_durations)
            if build_job.duration > typical_duration * 1.5:
                score += 0.2

        return min(1.0, score)

    def _assess_test_risk(self, build_job: BuildJob) -> float:
        """Score risk from the test failure rate and unit-test coverage."""
        if not build_job.test_results:
            # No test results at all is treated as high risk.
            return 0.8

        executed = 0
        failures = 0
        for outcome in build_job.test_results.values():
            passed_count = outcome.get('passed', 0)
            failed_count = outcome.get('failed', 0)
            executed += passed_count + failed_count
            failures += failed_count

        score = 0.0
        if executed > 0:
            score += failures / executed * 0.8

        # Coverage is read from the unit-test results; below 0.7 adds risk
        # proportional to the shortfall.
        unit_coverage = build_job.test_results.get(TestType.UNIT, {}).get('coverage', 0)
        if unit_coverage < 0.7:
            score += (0.7 - unit_coverage) * 0.5

        return min(1.0, score)

    def _assess_change_risk(self, build_job: BuildJob) -> float:
        """Score risk from the branch kind and the author's track record."""
        branch_name = build_job.branch

        # Prefix-based penalties are checked first; the first match wins.
        prefix_penalties = (
            ('hotfix/', 0.3),
            ('feature/', 0.4),
        )

        score = 0.0
        for prefix, penalty in prefix_penalties:
            if branch_name.startswith(prefix):
                score += penalty
                break
        else:
            # No prefix matched: mainline branches get a small penalty,
            # any other branch name gets none.
            if branch_name in ['main', 'master']:
                score += 0.1

        success_rate = self._get_author_success_rate(build_job.author)
        score += (1 - success_rate) * 0.3

        return min(1.0, score)

    def _assess_historical_risk(self, deployment_job: DeploymentJob) -> float:
        """Score risk from recent failures in the same deployment stage."""
        stage_history = [past for past in self.deployment_history
                         if past.stage == deployment_job.stage]
        if not stage_history:
            return 0.0

        # Only the ten most recent deployments to this stage are considered.
        recent = stage_history[-10:]
        failed_count = sum(1 for past in recent
                           if past.status == BuildStatus.FAILED)

        return min(1.0, failed_count / len(recent) * 0.6)

    def _assess_environment_risk(self, deployment_job: DeploymentJob) -> float:
        """Score risk from the target stage and the time of deployment."""
        stage_baseline = {
            DeploymentStage.DEV: 0.1,
            DeploymentStage.TEST: 0.2,
            DeploymentStage.STAGING: 0.3,
            DeploymentStage.PRODUCTION: 0.5
        }
        # Stages missing from the table fall back to 0.3.
        score = stage_baseline.get(deployment_job.stage, 0.3)

        moment = deployment_job.timestamp

        # Hours 9 through 17 inclusive are treated as working hours.
        if not 9 <= moment.hour <= 17:
            score += 0.2

        # weekday() >= 5 means Saturday or Sunday.
        if moment.weekday() >= 5:
            score += 0.3

        return min(1.0, score)

    def _get_author_success_rate(self, author: str) -> float:
        """Return the author's historical success rate.

        Simplified placeholder: always returns 0.8 regardless of ``author``.
        """
        default_rate = 0.8
        return default_rate

    def _initialize_risk_factors(self) -> Dict[str, float]:
        """Return the table of named risk-factor weights."""
        factors = {
            'build_failure': 0.8,
            'test_failure': 0.6,
            'low_coverage': 0.4,
            'new_author': 0.3,
            'weekend_deployment': 0.3,
            'production_deployment': 0.5
        }
        return factors

# 使用示例和演示
def demo_intelligent_cicd_system():
    """Run a console walkthrough of the intelligent CI/CD components.

    The demo schedules three sample builds, sleeps for two seconds, prints the
    build history, selects tests for a sample change set, scores deployment
    risk for successful builds, shows resource utilisation and finally writes
    ``cicd_pipeline_report.md`` (relative path, UTF-8).
    """

    def announce(heading):
        # Every numbered step opens with its heading and a 30-dash rule.
        print(heading)
        print("-" * 30)

    print("🚀 AI驱动的持续集成与部署优化系统演示")
    print("=" * 60)

    # Step 1: create the components
    announce("\n1. 初始化系统组件")

    scheduler = IntelligentBuildScheduler()
    test_selector = IntelligentTestSelector()
    risk_assessor = DeploymentRiskAssessor()

    for ready_message in (
        "✓ 智能构建调度器已初始化",
        "✓ 智能测试选择器已初始化",
        "✓ 部署风险评估器已初始化",
    ):
        print(ready_message)

    # Step 2: submit simulated build requests
    announce("\n2. 处理构建请求")

    user_service_request = {
        'project': 'user-service',
        'branch': 'feature/user-authentication',
        'commit': 'abc123def456',
        'author': 'alice@example.com',
        'dependencies': ['auth-lib', 'database-connector'],
        'importance': 7.0
    }
    payment_service_request = {
        'project': 'payment-service',
        'branch': 'main',
        'commit': 'def456ghi789',
        'author': 'bob@example.com',
        'dependencies': ['payment-gateway', 'encryption-lib'],
        'importance': 9.0,
        'urgent': True
    }
    notification_service_request = {
        'project': 'notification-service',
        'branch': 'hotfix/email-bug',
        'commit': 'ghi789jkl012',
        'author': 'charlie@example.com',
        'dependencies': ['email-client'],
        'importance': 8.0
    }
    build_requests = [
        user_service_request,
        payment_service_request,
        notification_service_request,
    ]

    build_ids = []
    for position, request in enumerate(build_requests, start=1):
        scheduled_id = scheduler.schedule_build(request)
        build_ids.append(scheduled_id)
        print(f"✓ 构建请求 {position} 已调度,构建ID: {scheduled_id}")

    # Step 3: give the builds a moment, then show what happened
    announce("\n3. 构建执行状态")

    time.sleep(2)

    print("构建历史:")
    for finished in scheduler.build_history:
        if finished.status == BuildStatus.SUCCESS:
            status_icon = "✅"
        else:
            status_icon = "❌"
        print(f"  {status_icon} {finished.project_name} ({finished.branch}): {finished.status.value}")
        print(f"     持续时间: {finished.duration:.1f}秒")
        if finished.failure_reason:
            print(f"     失败原因: {finished.failure_reason}")

    # Step 4: pick tests for a simulated change set
    announce("\n4. 智能测试选择")

    changed_files = [
        'src/main/java/com/example/controller/UserController.java',
        'src/main/java/com/example/service/AuthService.java',
        'src/main/resources/application.yml'
    ]

    if scheduler.build_history:
        first_build = scheduler.build_history[0]
        tests_by_type = test_selector.select_tests(first_build, changed_files)

        print("基于代码变更选择的测试:")
        for test_type, chosen in tests_by_type.items():
            if not chosen:
                continue
            print(f"  {test_type.value}: {len(chosen)} 个测试")
            # Show at most three names, then summarise the remainder.
            for test_name in chosen[:3]:
                print(f"    - {test_name}")
            hidden_count = len(chosen) - 3
            if hidden_count > 0:
                print(f"    ... 还有 {hidden_count} 个测试")

    # Step 5: score deployment risk for every successful build
    announce("\n5. 部署风险评估")

    for candidate in scheduler.build_history:
        if candidate.status != BuildStatus.SUCCESS:
            continue

        deployment = DeploymentJob(
            id=f"deploy_{candidate.id}",
            build_id=candidate.id,
            stage=DeploymentStage.STAGING,
            version="1.2.3",
            timestamp=datetime.now(),
            status=BuildStatus.PENDING,
            duration=0.0,
            rollback_version="1.2.2",
            health_checks={},
            performance_metrics={},
            risk_score=0.0,
            approval_required=False
        )

        risk_score = risk_assessor.assess_deployment_risk(deployment, candidate)
        deployment.risk_score = risk_score

        # Label and icon share the same 0.3 / 0.7 bands.
        if risk_score < 0.3:
            risk_level, risk_icon = "低", "🟢"
        elif risk_score < 0.7:
            risk_level, risk_icon = "中", "🟡"
        else:
            risk_level, risk_icon = "高", "🔴"

        print(f"{risk_icon} {candidate.project_name} 部署风险: {risk_level} ({risk_score:.2f})")

        # The recommendation uses its own 0.5 / 0.3 cut-offs.
        if risk_score > 0.5:
            advice = "  建议: 需要额外审批或在低峰时段部署"
        elif risk_score > 0.3:
            advice = "  建议: 加强监控,准备快速回滚"
        else:
            advice = "  建议: 可以正常部署"
        print(advice)

    # Step 6: resource utilisation as 20-cell text bars
    announce("\n6. 资源利用率监控")

    utilization = scheduler.resource_pool.get_resource_utilization()
    print("当前资源利用率:")
    for resource, usage in utilization.items():
        filled_cells = int(usage * 20)
        bar = "█" * filled_cells + "░" * (20 - filled_cells)
        usage_percent = usage * 100
        print(f"  {resource.upper()}: {bar} {usage_percent:.1f}%")

    # Step 7: render and save the pipeline report
    announce("\n7. 生成CI/CD报告")

    report_generator = CICDReportGenerator()
    report = report_generator.generate_pipeline_report(
        scheduler.build_history,
        scheduler.resource_pool.get_resource_utilization()
    )

    with open('cicd_pipeline_report.md', 'w', encoding='utf-8') as report_file:
        report_file.write(report)

    print("CI/CD流水线报告已生成: cicd_pipeline_report.md")

    print("\n🎯 智能CI/CD系统演示完成!")
    print("\n生成的文件:")
    print("- cicd_pipeline_report.md (CI/CD流水线报告)")

class CICDReportGenerator:
    """Renders build history and resource utilisation as a Markdown report."""

    def generate_pipeline_report(self, build_history: List[BuildJob], 
                               resource_utilization: Dict[str, float]) -> str:
        """Build the complete pipeline report.

        Args:
            build_history: Builds to summarise; may be empty, in which case the
                build statistics and failure analysis are left out.
            resource_utilization: Utilisation per resource name as a fraction
                (the values are multiplied by 100 for display).

        Returns:
            The report as a single Markdown string.
        """
        sections = [self._render_header(build_history)]

        if build_history:
            sections.append(self._render_build_statistics(build_history))

        sections.append(self._render_resource_usage(resource_utilization))

        if build_history:
            sections.append(self._render_failure_analysis(build_history))

        sections.append(self._render_recommendations(build_history, resource_utilization))
        sections.append(self._render_closing())

        return "".join(sections)

    def _render_header(self, build_history):
        """Title, generation time and total build count."""
        generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        return (
            "# CI/CD流水线分析报告\n"
            "\n"
            "## 报告概述\n"
            "\n"
            "### 基本信息\n"
            f"- **报告生成时间**: {generated_at}\n"
            "- **分析周期**: 最近构建记录\n"
            f"- **构建总数**: {len(build_history)}\n"
            "\n"
            "## 构建统计\n"
            "\n"
            "### 构建成功率\n"
        )

    def _render_build_statistics(self, build_history):
        """Overall and per-project success rates plus duration statistics."""
        succeeded = [job for job in build_history if job.status == BuildStatus.SUCCESS]
        overall_rate = len(succeeded) / len(build_history) * 100

        chunks = [
            "\n"
            f"- **总构建数**: {len(build_history)}\n"
            f"- **成功构建**: {len(succeeded)}\n"
            f"- **成功率**: {overall_rate:.1f}%\n"
        ]

        # Tally per project, keeping first-seen order.
        per_project = {}
        for job in build_history:
            tally = per_project.setdefault(job.project_name, {'total': 0, 'success': 0})
            tally['total'] += 1
            if job.status == BuildStatus.SUCCESS:
                tally['success'] += 1

        chunks.append("\n### 项目构建统计\n")
        for project, tally in per_project.items():
            if tally['total'] > 0:
                project_rate = tally['success'] / tally['total'] * 100
            else:
                project_rate = 0
            chunks.append(
                f"- **{project}**: {tally['success']}/{tally['total']} ({project_rate:.1f}%)\n"
            )

        # Builds with a non-positive duration are left out of the timing stats.
        timed = [job.duration for job in build_history if job.duration > 0]
        if timed:
            mean_seconds = np.mean(timed)
            median_seconds = np.median(timed)
            chunks.append(
                "\n"
                "### 构建时间分析\n"
                f"- **平均构建时间**: {mean_seconds:.1f} 秒\n"
                f"- **中位数构建时间**: {median_seconds:.1f} 秒\n"
                f"- **最长构建时间**: {max(timed):.1f} 秒\n"
                f"- **最短构建时间**: {min(timed):.1f} 秒\n"
            )

        return "".join(chunks)

    def _render_resource_usage(self, resource_utilization):
        """One bullet per resource with a coarse status label."""
        chunks = [
            "\n"
            "## 资源利用率\n"
            "\n"
            "### 当前资源使用情况\n"
        ]
        for resource, usage in resource_utilization.items():
            if usage < 0.7:
                status = "正常"
            elif usage < 0.9:
                status = "较高"
            else:
                status = "过高"
            usage_percent = usage * 100
            chunks.append(f"- **{resource.upper()}**: {usage_percent:.1f}% ({status})\n")
        return "".join(chunks)

    def _render_failure_analysis(self, build_history):
        """Failure counts by reason; empty string when nothing failed."""
        failed = [job for job in build_history if job.status == BuildStatus.FAILED]
        if not failed:
            return ""

        failure_share = len(failed) / len(build_history) * 100
        chunks = [
            "\n"
            "## 失败分析\n"
            "\n"
            "### 失败构建统计\n"
            f"- **失败构建数**: {len(failed)}\n"
            f"- **失败率**: {failure_share:.1f}%\n"
            "\n"
            "### 主要失败原因\n"
        ]

        occurrences = {}
        for job in failed:
            reason = job.failure_reason or "未知原因"
            occurrences[reason] = occurrences.get(reason, 0) + 1

        # Most frequent first; ties keep first-seen order (stable sort).
        ranked = sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)
        for reason, count in ranked:
            chunks.append(f"- **{reason}**: {count} 次\n")

        return "".join(chunks)

    def _render_recommendations(self, build_history, resource_utilization):
        """Performance and capacity hints derived from the inputs."""
        chunks = [
            "\n"
            "## 优化建议\n"
            "\n"
            "### 性能优化\n"
        ]

        if build_history:
            mean_seconds = np.mean([job.duration for job in build_history if job.duration > 0])
            if mean_seconds > 600:  # more than ten minutes
                chunks.append("- 构建时间较长,建议优化构建脚本和启用并行构建\n")

            retried_count = len([job for job in build_history if job.retry_count > 0])
            if retried_count > len(build_history) * 0.1:
                chunks.append("- 重试率较高,建议分析和修复不稳定的构建步骤\n")

        saturated = [name for name, usage in resource_utilization.items() if usage > 0.8]
        if saturated:
            chunks.append(f"- 以下资源使用率较高,建议扩容: {', '.join(saturated)}\n")

        return "".join(chunks)

    def _render_closing(self):
        """Fixed process advice, summary and footer."""
        return (
            "\n"
            "### 流程优化\n"
            "- 建立更完善的测试策略,提高测试覆盖率\n"
            "- 实施更智能的测试选择,减少不必要的测试执行\n"
            "- 加强部署前的风险评估和审批流程\n"
            "- 建立更完善的监控和告警机制\n"
            "\n"
            "## 总结\n"
            "\n"
            "本报告分析了CI/CD流水线的运行状况,包括构建成功率、资源利用率和失败原因分析。\n"
            "建议根据以上分析结果,持续优化构建流程,提高交付效率和质量。\n"
            "\n"
            "---\n"
            "\n"
            "**报告生成工具**: AI驱动的CI/CD优化系统  \n"
            "**下次更新**: 建议每周生成一次报告以跟踪改进效果\n"
        )

# 性能监控和异常检测
class PerformanceMonitor:
    """Collects simulated system metrics and raises threshold/anomaly alerts."""

    def __init__(self):
        self.metrics_history = []
        self.anomaly_detector = AnomalyDetector()
        self.alert_manager = AlertManager()
        self.thresholds = self._initialize_thresholds()

    def collect_metrics(self) -> SystemMetrics:
        """Generate one simulated sample, clamp it and append it to history.

        Returns:
            The new sample. Only the 1000 most recent samples are retained.
        """
        # Simulated collection: every value is drawn from a random distribution.
        sample = SystemMetrics(
            timestamp=datetime.now(),
            cpu_usage=np.random.normal(0.6, 0.1),
            memory_usage=np.random.normal(0.7, 0.15),
            disk_usage=np.random.normal(0.5, 0.1),
            network_io=np.random.normal(100, 20),
            response_time=np.random.normal(200, 50),
            error_rate=np.random.normal(0.02, 0.01),
            throughput=np.random.normal(1000, 100),
            active_users=np.random.randint(800, 1200)
        )

        # Ratio-style metrics are clamped to [0, 1]; response time to >= 0.
        for ratio_field in ('cpu_usage', 'memory_usage', 'disk_usage', 'error_rate'):
            clamped = max(0, min(1, getattr(sample, ratio_field)))
            setattr(sample, ratio_field, clamped)
        sample.response_time = max(0, sample.response_time)

        self.metrics_history.append(sample)
        if len(self.metrics_history) > 1000:
            self.metrics_history = self.metrics_history[-1000:]

        return sample

    def monitor_and_alert(self):
        """Collect a sample, evaluate it and dispatch any resulting alerts.

        Returns:
            A ``(sample, alerts)`` tuple; threshold alerts come before anomaly
            alerts in the list.
        """
        snapshot = self.collect_metrics()

        raised = (
            self._check_thresholds(snapshot)
            + self.anomaly_detector.detect_anomalies(snapshot, self.metrics_history)
        )

        for alert in raised:
            self.alert_manager.send_alert(alert)

        return snapshot, raised

    def _new_alert(self, metrics, id_prefix, severity, source, message, metric_name):
        """Build an unresolved Alert carrying the offending metric value."""
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        return Alert(
            id=f"{id_prefix}_{stamp}",
            timestamp=metrics.timestamp,
            severity=severity,
            source=source,
            message=message,
            metrics={metric_name: getattr(metrics, metric_name)},
            resolved=False,
            resolution_time=None,
            false_positive=False
        )

    def _check_thresholds(self, metrics: SystemMetrics) -> List[Alert]:
        """Compare a sample against the configured limits.

        CPU has a critical and a warning level; memory, response time and
        error rate are checked against their critical level only.
        """
        limits = self.thresholds
        raised = []

        # CPU usage: critical takes precedence over warning.
        if metrics.cpu_usage > limits['cpu_critical']:
            raised.append(self._new_alert(
                metrics, "cpu_alert", AlertSeverity.CRITICAL, "cpu_monitor",
                f"CPU使用率过高: {metrics.cpu_usage:.1%}", 'cpu_usage'))
        elif metrics.cpu_usage > limits['cpu_warning']:
            raised.append(self._new_alert(
                metrics, "cpu_alert", AlertSeverity.HIGH, "cpu_monitor",
                f"CPU使用率较高: {metrics.cpu_usage:.1%}", 'cpu_usage'))

        # Memory usage
        if metrics.memory_usage > limits['memory_critical']:
            raised.append(self._new_alert(
                metrics, "memory_alert", AlertSeverity.CRITICAL, "memory_monitor",
                f"内存使用率过高: {metrics.memory_usage:.1%}", 'memory_usage'))

        # Response time
        if metrics.response_time > limits['response_time_critical']:
            raised.append(self._new_alert(
                metrics, "response_alert", AlertSeverity.HIGH, "response_monitor",
                f"响应时间过长: {metrics.response_time:.0f}ms", 'response_time'))

        # Error rate
        if metrics.error_rate > limits['error_rate_critical']:
            raised.append(self._new_alert(
                metrics, "error_alert", AlertSeverity.CRITICAL, "error_monitor",
                f"错误率过高: {metrics.error_rate:.2%}", 'error_rate'))

        return raised

    def _initialize_thresholds(self) -> Dict[str, float]:
        """Return the default alert limits keyed by name."""
        limits = {
            'cpu_warning': 0.8,
            'cpu_critical': 0.9,
            'memory_warning': 0.8,
            'memory_critical': 0.9,
            'disk_warning': 0.8,
            'disk_critical': 0.9,
            'response_time_warning': 500,  # ms
            'response_time_critical': 1000,  # ms
            'error_rate_warning': 0.05,  # 5%
            'error_rate_critical': 0.1,  # 10%
            'throughput_low': 500  # requests/min
        }
        return limits

class AnomalyDetector:
    """Flags metric values that lie far from their recent history.

    A value is anomalous when it is more than ``Z_THRESHOLD`` population
    standard deviations away from the mean of the trailing ``WINDOW_SIZE``
    baseline samples. The sample being evaluated is never part of its own
    baseline, even when the caller has already appended it to ``history``.
    """

    # Metric attributes inspected on every sample, in reporting order.
    MONITORED_METRICS = ('cpu_usage', 'memory_usage', 'response_time',
                         'error_rate', 'throughput')
    MIN_HISTORY = 30    # detect_anomalies stays silent below this many samples
    WINDOW_SIZE = 30    # trailing samples that form the baseline
    MIN_BASELINE = 10   # fewest baseline values needed for a verdict
    Z_THRESHOLD = 3.0   # allowed distance from the mean, in standard deviations

    def detect_anomalies(self, current_metrics: "SystemMetrics", 
                        history: List["SystemMetrics"]) -> List["Alert"]:
        """Return one MEDIUM alert per monitored metric that looks anomalous.

        Args:
            current_metrics: The sample to evaluate.
            history: Past samples, oldest first; may already contain
                ``current_metrics``.

        Returns:
            A possibly empty list of alerts; always empty when ``history`` has
            fewer than ``MIN_HISTORY`` entries.
        """
        alerts = []
        for metric_name in self._find_anomalous_metrics(current_metrics, history):
            alerts.append(Alert(
                id=f"anomaly_{metric_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
                timestamp=current_metrics.timestamp,
                severity=AlertSeverity.MEDIUM,
                source="anomaly_detector",
                message=f"检测到{metric_name}异常",
                metrics={metric_name: getattr(current_metrics, metric_name)},
                resolved=False,
                resolution_time=None,
                false_positive=False
            ))
        return alerts

    def _find_anomalous_metrics(self, current_metrics, history) -> List[str]:
        """Return the names of monitored metrics whose current value is anomalous."""
        if len(history) < self.MIN_HISTORY:  # not enough history yet
            return []

        # Drop the evaluated sample (matched by identity) so that a spike
        # cannot inflate the mean and standard deviation it is compared to.
        baseline = [sample for sample in history if sample is not current_metrics]

        return [
            metric_name
            for metric_name in self.MONITORED_METRICS
            if self._detect_metric_anomaly(
                metric_name, getattr(current_metrics, metric_name), baseline)
        ]

    def _detect_metric_anomaly(self, metric_name: str, current_value: float, 
                             history: List["SystemMetrics"]) -> bool:
        """Return True when ``current_value`` is an outlier for ``metric_name``.

        Uses the last ``WINDOW_SIZE`` entries of ``history``; returns False
        when fewer than ``MIN_BASELINE`` values are available. With a
        zero-variance baseline any deviation from the mean counts as an
        anomaly.
        """
        recent_values = [getattr(sample, metric_name)
                         for sample in history[-self.WINDOW_SIZE:]]

        if len(recent_values) < self.MIN_BASELINE:
            return False

        mean = np.mean(recent_values)
        std = np.std(recent_values)

        # Convert NumPy's bool_ to a plain bool to honour the annotation.
        return bool(abs(current_value - mean) > self.Z_THRESHOLD * std)

class AlertManager:
    """Tracks active alerts, suppresses near-duplicates and sends notifications."""

    # Alerts from the same source with the same severity that are less than
    # this many seconds apart are treated as duplicates.
    DUPLICATE_WINDOW_SECONDS = 300

    def __init__(self):
        self.active_alerts = []
        self.alert_history = []
        self.notification_channels = ['email', 'slack', 'webhook']

    def send_alert(self, alert: "Alert"):
        """Register and announce ``alert`` unless it duplicates an active one."""
        if self._is_duplicate_alert(alert):
            return

        self.active_alerts.append(alert)
        self.alert_history.append(alert)

        self._send_notification(alert)

        print(f"🚨 告警: {alert.message} (严重程度: {alert.severity.value})")

    def resolve_alert(self, alert_id: str):
        """Mark the first active alert with ``alert_id`` resolved and drop it.

        Does nothing when no active alert has that id.
        """
        for alert in self.active_alerts:
            if alert.id == alert_id:
                alert.resolved = True
                alert.resolution_time = datetime.now()
                # Removing during iteration is safe because we stop right after.
                self.active_alerts.remove(alert)
                break

    def _is_duplicate_alert(self, new_alert: "Alert") -> bool:
        """Return True when an active alert matches ``new_alert`` closely in time.

        The full elapsed time is compared. ``timedelta.seconds`` only holds
        the seconds component and ignores whole days, so it would wrongly
        treat alerts raised a day apart as duplicates.
        """
        for active_alert in self.active_alerts:
            if (active_alert.source != new_alert.source
                    or active_alert.severity != new_alert.severity):
                continue
            elapsed = (new_alert.timestamp - active_alert.timestamp).total_seconds()
            if abs(elapsed) < self.DUPLICATE_WINDOW_SECONDS:
                return True
        return False

    def _send_notification(self, alert: "Alert"):
        """Deliver the alert to the notification channels.

        Placeholder: a real implementation would send the notification here.
        """
        pass

# Script entry point: run the demo only when this file is executed directly.
if __name__ == "__main__":
    demo_intelligent_cicd_system()

总结与最佳实践

CI/CD最佳实践

class CICDBestPractices:
    """Static catalogue of CI/CD principles and optimisation strategies."""

    @staticmethod
    def get_pipeline_principles() -> Dict[str, List[str]]:
        """Return pipeline principles grouped by category.

        Each call builds a new dict with new lists, so callers may modify the
        result freely.
        """
        catalogue = (
            ('自动化原则', (
                '自动化构建和测试',
                '自动化部署流程',
                '自动化质量检查',
                '自动化回滚机制',
                '自动化监控告警',
            )),
            ('快速反馈', (
                '快速构建执行',
                '并行测试运行',
                '实时状态通知',
                '快速失败检测',
                '及时问题修复',
            )),
            ('质量保证', (
                '代码质量检查',
                '安全漏洞扫描',
                '性能基准测试',
                '兼容性验证',
                '文档同步更新',
            )),
            ('可靠性', (
                '幂等性部署',
                '蓝绿部署策略',
                '金丝雀发布',
                '自动回滚',
                '健康检查',
            )),
        )
        return {category: list(entries) for category, entries in catalogue}

    @staticmethod
    def get_optimization_strategies() -> Dict[str, List[str]]:
        """Return optimisation strategies grouped by pipeline area.

        Each call builds a new dict with new lists, so callers may modify the
        result freely.
        """
        catalogue = (
            ('构建优化', (
                '增量构建',
                '并行构建',
                '构建缓存',
                '依赖管理',
                '资源复用',
            )),
            ('测试优化', (
                '智能测试选择',
                '测试并行化',
                '测试数据管理',
                '不稳定测试处理',
                '测试环境隔离',
            )),
            ('部署优化', (
                '部署策略选择',
                '环境一致性',
                '配置管理',
                '服务发现',
                '流量管理',
            )),
            ('监控优化', (
                '关键指标监控',
                '异常检测',
                '性能分析',
                '日志聚合',
                '告警优化',
            )),
        )
        return {area: list(entries) for area, entries in catalogue}

def main():
    """Run the demo, then print a short digest of the pipeline principles."""
    for banner_line in ("🚀 AI驱动的持续集成与部署优化系统", "=" * 50):
        print(banner_line)

    # Full component walkthrough
    demo_intelligent_cicd_system()

    # Best-practice digest
    principles_by_category = CICDBestPractices.get_pipeline_principles()

    print("\n📋 CI/CD最佳实践:")
    print("-" * 30)

    for category in principles_by_category:
        print(f"\n{category}:")
        # Only the first three principles of each category are listed.
        for principle in principles_by_category[category][:3]:
            print(f"  ✓ {principle}")

    print("\n🎯 CI/CD优化完成!")

# Script entry point: run main() only when this file is executed directly.
if __name__ == "__main__":
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

CarlowZJ

我的文章对你有用的话,可以支持

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值