Python Performance Tuning Cookbook: A Deep Dive into Profiling Tools and Optimization Strategies

Performance optimization is an unavoidable topic in Python development. Although Python is known for its concise, elegant syntax, performance can become a project bottleneck in certain scenarios. This article takes a systematic look at Python performance analysis and optimization, from basic profiling tools to advanced optimization strategies, to help developers diagnose and solve performance problems methodically.

Why Bother with Performance Tuning?

Before diving into the technical details, let's understand why performance tuning matters:

  • User experience: response time directly affects user satisfaction
  • Resource cost: optimized programs save server resources and operating expenses
  • Scalability: efficient code copes more easily with the load that comes with business growth
  • Competitive advantage: with equivalent features, the better-performing product usually wins

Python Performance Analysis Fundamentals

Common Types of Performance Bottleneck

  1. CPU-bound: heavy computation keeps CPU utilization high
  2. Memory-bound: inefficient memory use or memory leaks
  3. I/O-bound: frequent file reads/writes or network requests (a quick way to tell this apart from the CPU-bound case is sketched below)
  4. Algorithmic complexity: excessive time or space complexity
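
As a rough, standard-library-only heuristic for separating the first and third categories (an illustrative sketch, not a rigorous classifier): compare CPU time against wall-clock time. If the wall clock advances much faster than the CPU clock, the workload is mostly waiting rather than computing.

import time

def classify(func):
    """Rough heuristic: wall time >> CPU time suggests an I/O- or wait-bound workload."""
    wall0, cpu0 = time.perf_counter(), time.process_time()
    func()
    wall = time.perf_counter() - wall0
    cpu = time.process_time() - cpu0
    kind = "CPU-bound" if cpu / wall > 0.8 else "likely I/O- or wait-bound"
    print(f"wall={wall:.3f}s cpu={cpu:.3f}s -> {kind}")

classify(lambda: sum(i * i for i in range(10**6)))  # computation: CPU-bound
classify(lambda: time.sleep(0.5))                   # waiting: behaves like I/O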

Basic Principles of Performance Analysis

Measure before you optimize: before making any change, measure current performance accurately and locate the real bottleneck. Blind optimization not only wastes time, it can introduce new problems.
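
As a minimal illustration of measure-first, a quick timeit micro-benchmark (a hypothetical comparison between two equivalent one-liners) takes seconds to run and replaces guesswork:

import timeit

# Hypothetical micro-benchmark: measure instead of assuming which variant is faster
comp_time = timeit.timeit("[str(i) for i in range(1000)]", number=1000)
map_time = timeit.timeit("list(map(str, range(1000)))", number=1000)
print(f"list comprehension: {comp_time:.4f}s, map: {map_time:.4f}s")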

Core Profiling Tools in Detail

1. cProfile - Python's Built-in Profiler

cProfile is the most important profiling tool in the Python standard library. It records, in detail, how many times each function is called and how long each call takes during program execution.

Basic usage
import cProfile
import pstats

def fibonacci(n):
    if n <= 1:
        return n
    return fibonacci(n-1) + fibonacci(n-2)

def main():
    result = fibonacci(30)
    print(f"Fibonacci(30) = {result}")

# Method 1: invoke directly in code
cProfile.run('main()')

# Method 2: save the profiling results to a file
cProfile.run('main()', 'profile_output.prof')

# Method 3: use a Profile object and analyze with pstats
pr = cProfile.Profile()
pr.enable()
main()
pr.disable()

# Analyze the results
stats = pstats.Stats(pr)
stats.sort_stats('cumulative')
stats.print_stats(10)  # show the 10 most expensive functions
Command-line usage
# Run a script under the profiler, sorted by cumulative time
python -m cProfile -s cumulative your_script.py

# Save the results to a file
python -m cProfile -o profile.prof your_script.py
Interpreting the output

cProfile's output contains the following key columns:

  • ncalls: number of times the function was called
  • tottime: time spent in the function itself (excluding sub-calls)
  • percall: average time per call
  • cumtime: cumulative time (including sub-calls)
  • filename:lineno(function): where the function lives
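
The .prof file saved by method 2 can be re-loaded later with pstats and explored offline; a short sketch of the typical workflow:

import pstats

# Re-load the profile saved earlier and explore it interactively
stats = pstats.Stats('profile_output.prof')
stats.strip_dirs()                # drop long path prefixes for readability
stats.sort_stats('tottime')       # sort by time spent in the function itself
stats.print_stats(10)
stats.print_callers('fibonacci')  # which functions call fibonacci?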

2. line_profiler - Line-by-Line Profiling

line_profiler provides line-by-line execution timings, which makes it very useful for pinpointing the exact lines behind a performance problem.

Installation and usage
pip install line_profiler
# the @profile decorator is injected by kernprof at runtime; no import is required
@profile
def slow_function():
    total = 0
    for i in range(1000000):
        total += i * i
    
    result = []
    for i in range(10000):
        result.append(str(i))
    
    return total, result

if __name__ == "__main__":
    slow_function()
# Run the line-by-line analysis
kernprof -l -v your_script.py
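
line_profiler can also be driven programmatically through its LineProfiler class, avoiding the kernprof wrapper entirely; a minimal sketch:

from line_profiler import LineProfiler

def hotspot():
    return sum(i * i for i in range(100000))

lp = LineProfiler()
wrapped = lp(hotspot)  # wrapping registers the function and times every line
wrapped()
lp.print_stats()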

3. memory_profiler - Memory Usage Analysis

memory_profiler helps us understand a program's memory usage patterns and uncover memory leaks and peak usage.

from memory_profiler import profile
import numpy as np

@profile
def memory_intensive_function():
    # Create a large array
    big_array = np.random.random((1000, 1000))
    
    # Do some computation
    result = np.dot(big_array, big_array.T)
    
    # Create a list
    big_list = [i for i in range(1000000)]
    
    return result.sum()

if __name__ == "__main__":
    memory_intensive_function()
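
Besides the @profile decorator, memory_profiler provides a memory_usage function that samples a callable's memory while it runs, which is handy for one-off checks; a short sketch:

from memory_profiler import memory_usage

def build_list(n):
    return [i * i for i in range(n)]

# Sample the process's memory every 0.1s while build_list runs
samples = memory_usage((build_list, (1_000_000,)), interval=0.1)
print(f"Peak memory: {max(samples):.1f} MiB")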

4. py-spy - Profiling in Production

py-spy is a sampling profiler that is particularly well suited to production use, because its impact on the target program is minimal.

# Install
pip install py-spy

# Show a live top-like view of a running Python process
py-spy top --pid 12345

# Generate a flame graph
py-spy record -o profile.svg --pid 12345

# Profile for a fixed duration
py-spy record -o profile.svg --duration 30 --pid 12345
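
py-spy can also print a one-off snapshot of every thread's current call stack, which is often enough to diagnose a hung process:

# Dump the current call stack of each thread
py-spy dump --pid 12345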

Advanced Profiling Techniques

1. Statistical Performance Tracking

import time
import functools
from collections import defaultdict

class PerformanceTracker:
    def __init__(self):
        self.call_times = defaultdict(list)
        self.call_counts = defaultdict(int)
    
    def track(self, func_name):
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                start_time = time.perf_counter()
                result = func(*args, **kwargs)
                end_time = time.perf_counter()
                
                execution_time = end_time - start_time
                self.call_times[func_name].append(execution_time)
                self.call_counts[func_name] += 1
                
                return result
            return wrapper
        return decorator
    
    def report(self):
        print("Performance Report:")
        print("-" * 50)
        for func_name in self.call_times:
            times = self.call_times[func_name]
            avg_time = sum(times) / len(times)
            total_time = sum(times)
            count = self.call_counts[func_name]
            
            print(f"{func_name}:")
            print(f"  Calls: {count}")
            print(f"  Total time: {total_time:.4f}s")
            print(f"  Average time: {avg_time:.4f}s")
            print(f"  Min time: {min(times):.4f}s")
            print(f"  Max time: {max(times):.4f}s")
            print()

# Usage example
tracker = PerformanceTracker()

@tracker.track("database_query")
def query_database():
    time.sleep(0.1)  # simulate a database query
    return "data"

@tracker.track("data_processing")
def process_data(data):
    time.sleep(0.05)  # simulate data processing
    return f"processed_{data}"

# Run the test
for _ in range(10):
    data = query_database()
    process_data(data)

tracker.report()

2. A Benchmarking Framework

import timeit
import matplotlib.pyplot as plt
import numpy as np

class BenchmarkSuite:
    def __init__(self):
        self.results = {}
    
    def benchmark(self, name, stmt, setup="", number=1000, globals_dict=None):
        """Run a timed benchmark and record the average per-call time."""
        exec_time = timeit.timeit(stmt, setup=setup, number=number, globals=globals_dict)
        avg_time = exec_time / number
        self.results[name] = avg_time
        return avg_time
    
    def compare_algorithms(self, algorithms, test_data_sizes):
        """Compare the performance of different algorithms across data sizes."""
        results = {name: [] for name in algorithms}
        
        for size in test_data_sizes:
            print(f"Testing with data size: {size}")
            for name, func in algorithms.items():
                # Pass the function and test data in via timeit's globals;
                # copy the data on every run, since some sorts mutate in place
                avg_time = self.benchmark(
                    f"{name}_{size}",
                    "func(list(data))",
                    number=100,
                    globals_dict={"func": func, "data": list(range(size))},
                )
                results[name].append(avg_time)
        
        return results, test_data_sizes
    
    def plot_comparison(self, results, data_sizes, title="Algorithm Performance Comparison"):
        """绘制性能比较图"""
        plt.figure(figsize=(12, 8))
        
        for name, times in results.items():
            plt.plot(data_sizes, times, marker='o', label=name)
        
        plt.xlabel('Data Size')
        plt.ylabel('Average Execution Time (seconds)')
        plt.title(title)
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.yscale('log')  # use a logarithmic scale
        plt.show()

# Example: compare different sorting algorithms
def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n - i - 1):
            if arr[j] > arr[j + 1]:
                arr[j], arr[j + 1] = arr[j + 1], arr[j]
    return arr

def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    pivot = arr[len(arr) // 2]
    left = [x for x in arr if x < pivot]
    middle = [x for x in arr if x == pivot]
    right = [x for x in arr if x > pivot]
    return quick_sort(left) + middle + quick_sort(right)

# Run the benchmark
benchmark = BenchmarkSuite()
algorithms = {
    "Bubble Sort": bubble_sort,
    "Quick Sort": quick_sort,
    "Built-in Sort": sorted
}

data_sizes = [100, 500, 1000, 2000]
results, sizes = benchmark.compare_algorithms(algorithms, data_sizes)
benchmark.plot_comparison(results, sizes)

Core Optimization Strategies

1. Algorithm Optimization

Algorithm optimization is the most fundamental route to better performance. Choosing the right data structures and algorithms can yield order-of-magnitude improvements.

Choosing the right data structure
import time
from collections import deque

# Compare lookup performance: list vs set
def compare_lookup_performance():
    # Prepare the data
    data_list = list(range(10000))
    data_set = set(data_list)
    search_items = [1000, 5000, 9999]
    
    # List lookup
    start = time.perf_counter()
    for item in search_items * 1000:
        item in data_list
    list_time = time.perf_counter() - start
    
    # Set lookup
    start = time.perf_counter()
    for item in search_items * 1000:
        item in data_set
    set_time = time.perf_counter() - start
    
    print(f"List lookup time: {list_time:.4f}s")
    print(f"Set lookup time: {set_time:.4f}s")
    print(f"Set is {list_time/set_time:.1f}x faster")

# Compare queue operation performance
def compare_queue_performance():
    n = 100000
    
    # Simulate a queue with a list (inefficient)
    queue_list = []
    start = time.perf_counter()
    for i in range(n):
        queue_list.append(i)
    for i in range(n):
        queue_list.pop(0)  # removing from the head is an O(n) operation
    list_time = time.perf_counter() - start
    
    # Use deque (efficient)
    queue_deque = deque()
    start = time.perf_counter()
    for i in range(n):
        queue_deque.append(i)
    for i in range(n):
        queue_deque.popleft()  # an O(1) operation
    deque_time = time.perf_counter() - start
    
    print(f"List queue time: {list_time:.4f}s")
    print(f"Deque queue time: {deque_time:.4f}s")
    print(f"Deque is {list_time/deque_time:.1f}x faster")

compare_lookup_performance()
compare_queue_performance()
Caching
import functools
import time

# Cache function results
@functools.lru_cache(maxsize=128)
def expensive_calculation(n):
    """模拟耗时计算"""
    time.sleep(0.1)
    return n * n

# Class-level cache
class DataProcessor:
    def __init__(self):
        self._cache = {}
    
    def process_data(self, data_id):
        if data_id in self._cache:
            return self._cache[data_id]
        
        # Simulate expensive processing
        time.sleep(0.1)
        result = f"processed_{data_id}"
        self._cache[data_id] = result
        return result
    
    def clear_cache(self):
        self._cache.clear()

# A caching decorator
def cache_result(max_size=100):
    def decorator(func):
        cache = {}
        
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Build the cache key
            key = str(args) + str(sorted(kwargs.items()))
            
            if key in cache:
                return cache[key]
            
            result = func(*args, **kwargs)
            
            # Simple FIFO eviction (not true LRU): when the cache is full, drop the oldest entry
            if len(cache) >= max_size:
                cache.pop(next(iter(cache)))
            
            cache[key] = result
            return result
        
        wrapper.cache_clear = cache.clear
        wrapper.cache_info = lambda: f"Cache size: {len(cache)}"
        return wrapper
    return decorator

@cache_result(max_size=50)
def fibonacci_cached(n):
    if n <= 1:
        return n
    return fibonacci_cached(n-1) + fibonacci_cached(n-2)
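
functools.lru_cache also exposes cache introspection helpers, which are useful when tuning maxsize; a brief usage example with the expensive_calculation function defined above:

# The first call is slow; the repeat is served from the cache
expensive_calculation(10)
expensive_calculation(10)
print(expensive_calculation.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
expensive_calculation.cache_clear()        # reset between benchmark runs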

2. Code-Level Optimization

Loop optimization
import time
import numpy as np

# Avoid redundant work
def optimized_loop():
    data = list(range(1000000))
    
    # Inefficient version
    start = time.perf_counter()
    result1 = []
    for i in range(len(data)):  # note: range() calls len() once, not per iteration
        if data[i] % 2 == 0:  # repeated indexing plus a modulo on every pass
            result1.append(data[i] * 2)
    time1 = time.perf_counter() - start
    
    # 优化版本
    start = time.perf_counter()
    result2 = []
    data_len = len(data)  # hoist the length lookup out of the loop
    for i in range(data_len):
        value = data[i]  # avoid indexing twice
        if value & 1 == 0:  # bitwise test for evenness
            result2.append(value << 1)  # shift instead of multiply
    time2 = time.perf_counter() - start
    
    # Even better: a list comprehension
    start = time.perf_counter()
    result3 = [x << 1 for x in data if x & 1 == 0]
    time3 = time.perf_counter() - start
    
    print(f"Original: {time1:.4f}s")
    print(f"Optimized: {time2:.4f}s")
    print(f"List comprehension: {time3:.4f}s")

# Vectorized operations
def vectorization_example():
    data = np.random.random(1000000)
    
    # Pure Python loop
    start = time.perf_counter()
    result1 = []
    for x in data:
        result1.append(x * x + 2 * x + 1)
    time1 = time.perf_counter() - start
    
    # NumPy vectorization
    start = time.perf_counter()
    result2 = data * data + 2 * data + 1
    time2 = time.perf_counter() - start
    
    print(f"Python loop: {time1:.4f}s")
    print(f"NumPy vectorization: {time2:.4f}s")
    print(f"NumPy is {time1/time2:.1f}x faster")

optimized_loop()
vectorization_example()
String operation optimization
def string_optimization():
    words = ["hello", "world", "python", "performance"] * 10000
    
    # Inefficient: repeated string concatenation
    start = time.perf_counter()
    result1 = ""
    for word in words:
        result1 += word + " "
    time1 = time.perf_counter() - start
    
    # Efficient: use join
    start = time.perf_counter()
    result2 = " ".join(words)
    time2 = time.perf_counter() - start
    
    # Collect into a list, then join
    start = time.perf_counter()
    temp_list = []
    for word in words:
        temp_list.append(word)
    result3 = " ".join(temp_list)
    time3 = time.perf_counter() - start
    
    print(f"String concatenation: {time1:.4f}s")
    print(f"Join method: {time2:.4f}s")
    print(f"List + join: {time3:.4f}s")

string_optimization()

3. Concurrency and Parallelism

Multithreading for I/O-bound tasks
import concurrent.futures
import requests
import time

def fetch_url(url):
    """模拟网络请求"""
    response = requests.get(url)
    return len(response.content)

def compare_concurrent_performance():
    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/1"
    ]
    
    # Sequential execution
    start = time.perf_counter()
    results1 = [fetch_url(url) for url in urls]
    time1 = time.perf_counter() - start
    
    # Multithreaded execution
    start = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        results2 = list(executor.map(fetch_url, urls))
    time2 = time.perf_counter() - start
    
    print(f"Sequential: {time1:.2f}s")
    print(f"Concurrent: {time2:.2f}s")
    print(f"Speedup: {time1/time2:.1f}x")

# Note: real network timings vary, so treat this comparison as indicative only
Multiprocessing for CPU-bound tasks
import multiprocessing as mp
import math
import time

def is_prime(num):
    """Trial-division primality test."""
    if num < 2:
        return False
    for i in range(2, int(math.sqrt(num)) + 1):
        if num % i == 0:
            return False
    return True

def count_primes_in_range(start, end):
    """CPU-bound task: count primes in [start, end).
    Must be a top-level function so multiprocessing can pickle it."""
    return sum(1 for i in range(start, end) if is_prime(i))

def compare_multiprocessing():
    n = 50000
    
    # Single process
    start = time.perf_counter()
    result1 = count_primes_in_range(0, n)
    time1 = time.perf_counter() - start
    
    # Multiple processes
    start = time.perf_counter()
    with mp.Pool() as pool:
        # Split the range into one chunk per CPU core
        chunk_size = n // mp.cpu_count()
        tasks = [(i * chunk_size, (i + 1) * chunk_size) for i in range(mp.cpu_count())]
        tasks[-1] = (tasks[-1][0], n)  # extend the last chunk to cover the remainder
        
        results = pool.starmap(count_primes_in_range, tasks)
        result2 = sum(results)
    time2 = time.perf_counter() - start
    
    print(f"Single process: {time1:.2f}s")
    print(f"Multi process: {time2:.2f}s")
    print(f"Speedup: {time1/time2:.1f}x")

if __name__ == "__main__":  # required on platforms that spawn worker processes
    compare_multiprocessing()

Memory Optimization Strategies

1. Memory Analysis and Monitoring

import psutil
import gc
from pympler import muppy, summary

class MemoryMonitor:
    def __init__(self):
        self.process = psutil.Process()
        self.initial_memory = self.get_memory_usage()
    
    def get_memory_usage(self):
        """获取当前内存使用量(MB)"""
        return self.process.memory_info().rss / 1024 / 1024
    
    def memory_checkpoint(self, description=""):
        """记录内存使用检查点"""
        current_memory = self.get_memory_usage()
        change = current_memory - self.initial_memory
        print(f"Memory {description}: {current_memory:.1f}MB (Δ{change:+.1f}MB)")
        return current_memory
    
    def analyze_objects(self):
        """分析内存中的对象"""
        all_objects = muppy.get_objects()
        sum1 = summary.summarize(all_objects)
        summary.print_(sum1)

# Memory optimization example
def memory_optimization_example():
    monitor = MemoryMonitor()
    monitor.memory_checkpoint("Initial")
    
    # Create a lot of data
    big_list = list(range(1000000))
    monitor.memory_checkpoint("After creating big_list")
    
    # Use a generator instead of a list
    big_generator = (x for x in range(1000000))
    monitor.memory_checkpoint("After creating generator")
    
    # Drop the large object and force a garbage collection
    del big_list
    gc.collect()
    monitor.memory_checkpoint("After cleanup")

memory_optimization_example()

2. Generators and Iterators

def memory_efficient_processing():
    # Memory-hungry: load everything at once
    def process_file_memory_intensive(filename):
        with open(filename, 'r') as f:
            lines = f.readlines()  # loads the entire file into memory
            return [line.strip().upper() for line in lines if line.strip()]
    
    # Memory-friendly: use a generator
    def process_file_memory_efficient(filename):
        with open(filename, 'r') as f:
            for line in f:  # read one line at a time
                line = line.strip()
                if line:
                    yield line.upper()
    
    # A data-pipeline example
    def create_data_pipeline():
        def read_numbers():
            """生成数字序列"""
            for i in range(1000000):
                yield i
        
        def filter_even(numbers):
            """过滤偶数"""
            for num in numbers:
                if num % 2 == 0:
                    yield num
        
        def square_numbers(numbers):
            """计算平方"""
            for num in numbers:
                yield num * num
        
        # Compose the pipeline
        pipeline = square_numbers(filter_even(read_numbers()))
        
        # Only the first 10 results are ever computed (itertools.islice would also work)
        return [next(pipeline) for _ in range(10)]
    
    result = create_data_pipeline()
    print("Pipeline result:", result)

memory_efficient_processing()

3. The Object Pool Pattern

class ObjectPool:
    def __init__(self, factory_func, max_size=10):
        self.factory_func = factory_func
        self.max_size = max_size
        self.pool = []
    
    def acquire(self):
        """获取对象"""
        if self.pool:
            return self.pool.pop()
        return self.factory_func()
    
    def release(self, obj):
        """归还对象"""
        if len(self.pool) < self.max_size:
            # Reset the object's state
            if hasattr(obj, 'reset'):
                obj.reset()
            self.pool.append(obj)

class ExpensiveObject:
    def __init__(self):
        self.data = [0] * 10000  # simulate expensive initialization
        self.state = "initialized"
    
    def reset(self):
        self.state = "reset"
        # reset any other necessary state

# Using the object pool
def use_object_pool():
    pool = ObjectPool(ExpensiveObject, max_size=5)
    
    # Acquire objects
    obj1 = pool.acquire()
    obj2 = pool.acquire()
    
    # Use the objects
    obj1.state = "in_use"
    obj2.state = "in_use"
    
    # Release them back
    pool.release(obj1)
    pool.release(obj2)
    
    # Acquire again (reuses a pooled object)
    obj3 = pool.acquire()
    print(f"Reused object state: {obj3.state}")

use_object_pool()

Real-World Case Studies

Case 1: Web API Performance Optimization

import asyncio
import aiohttp
from functools import wraps
import time

# An async caching decorator
def async_cache(ttl=300):
    def decorator(func):
        cache = {}
        
        @wraps(func)
        async def wrapper(*args, **kwargs):
            key = str(args) + str(sorted(kwargs.items()))
            now = time.time()
            
            if key in cache:
                result, timestamp = cache[key]
                if now - timestamp < ttl:
                    return result
            
            result = await func(*args, **kwargs)
            cache[key] = (result, now)
            return result
        
        return wrapper
    return decorator

class APIClient:
    def __init__(self):
        self.session = None
    
    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self
    
    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.session.close()
    
    @async_cache(ttl=60)
    async def fetch_user_data(self, user_id):
        """获取用户数据(带缓存)"""
        async with self.session.get(f"https://api.example.com/users/{user_id}") as resp:
            return await resp.json()
    
    async def fetch_multiple_users(self, user_ids):
        """并发获取多个用户数据"""
        tasks = [self.fetch_user_data(user_id) for user_id in user_ids]
        return await asyncio.gather(*tasks)

# Performance comparison
async def compare_api_performance():
    user_ids = list(range(1, 21))
    
    async with APIClient() as client:
        # Sequential fetches
        start = time.perf_counter()
        sequential_results = []
        for user_id in user_ids:
            result = await client.fetch_user_data(user_id)
            sequential_results.append(result)
        sequential_time = time.perf_counter() - start
        
        # Concurrent fetches (note: entries cached during the sequential pass make this look faster than a cold run)
        start = time.perf_counter()
        concurrent_results = await client.fetch_multiple_users(user_ids)
        concurrent_time = time.perf_counter() - start
        
        print(f"Sequential: {sequential_time:.2f}s")
        print(f"Concurrent: {concurrent_time:.2f}s")
        print(f"Speedup: {sequential_time/concurrent_time:.1f}x")

# asyncio.run(compare_api_performance())  # uncomment to run

Case 2: Data Processing Optimization

import time
import pandas as pd
import numpy as np
import dask.dataframe as dd

class DataProcessor:
    def __init__(self):
        self.processing_functions = {
            'normalize': self._normalize_data,
            'aggregate': self._aggregate_data,
            'filter': self._filter_data
        }
    
    def _normalize_data(self, df):
        """数据标准化"""
        numeric_columns = df.select_dtypes(include=[np.number]).columns
        df[numeric_columns] = (df[numeric_columns] - df[numeric_columns].mean()) / df[numeric_columns].std()
        return df
    
    def _aggregate_data(self, df):
        """数据聚合"""
        return df.groupby('category').agg({
            'value': ['mean', 'sum', 'count'],
            'score': ['min', 'max']
        })
    
    def _filter_data(self, df):
        """数据过滤"""
        return df[df['score'] > df['score'].quantile(0.7)]
    
    def process_dataframe_optimized(self, df, operations):
        """优化的数据处理流水线"""
        # Chain the operations to avoid intermediate results
        result = df.copy()
        for operation in operations:
            if operation in self.processing_functions:
                result = self.processing_functions[operation](result)
        return result
    
    def process_large_dataset_parallel(self, file_path, operations, chunk_size=10000):
        """并行处理大数据集"""
        # Use Dask for partitioned, parallel processing
        ddf = dd.read_csv(file_path, blocksize=chunk_size)  # note: blocksize is measured in bytes
        
        for operation in operations:
            if operation == 'normalize':
                numeric_columns = ddf.select_dtypes(include=[np.number]).columns
                ddf[numeric_columns] = (ddf[numeric_columns] - ddf[numeric_columns].mean()) / ddf[numeric_columns].std()
            elif operation == 'filter':
                ddf = ddf[ddf['score'] > ddf['score'].quantile(0.7)]
        
        return ddf.compute()

# Performance test
def benchmark_data_processing():
    # Create test data
    np.random.seed(42)
    data = {
        'category': np.random.choice(['A', 'B', 'C'], 100000),
        'value': np.random.normal(0, 1, 100000),
        'score': np.random.uniform(0, 100, 100000)
    }
    df = pd.DataFrame(data)
    
    processor = DataProcessor()
    operations = ['normalize', 'filter']
    
    # Run the optimized pipeline
    start = time.perf_counter()
    result1 = processor.process_dataframe_optimized(df, operations)
    time1 = time.perf_counter() - start
    
    print(f"Optimized processing: {time1:.4f}s")
    print(f"Result shape: {result1.shape}")

benchmark_data_processing()

Performance Monitoring and Continuous Optimization

1. Application Performance Monitoring (APM)

import time
import json
import psutil
from datetime import datetime
from contextlib import contextmanager

class PerformanceMonitor:
    def __init__(self):
        self.metrics = []
        self.alerts = []
    
    @contextmanager
    def monitor_function(self, func_name, threshold=1.0):
        """监控函数执行时间"""
        start_time = time.perf_counter()
        start_memory = psutil.Process().memory_info().rss
        
        try:
            yield
        finally:
            end_time = time.perf_counter()
            end_memory = psutil.Process().memory_info().rss
            
            execution_time = end_time - start_time
            memory_delta = end_memory - start_memory
            
            metric = {
                'function': func_name,
                'execution_time': execution_time,
                'memory_delta': memory_delta,
                'timestamp': datetime.now().isoformat()
            }
            
            self.metrics.append(metric)
            
            # Performance alerting
            if execution_time > threshold:
                alert = {
                    'type': 'slow_function',
                    'function': func_name,
                    'execution_time': execution_time,
                    'threshold': threshold,
                    'timestamp': datetime.now().isoformat()
                }
                self.alerts.append(alert)
                print(f"ALERT: {func_name} took {execution_time:.4f}s (threshold: {threshold}s)")
    
    def get_performance_report(self):
        """生成性能报告"""
        if not self.metrics:
            return "No metrics collected"
        
        # Compute summary statistics
        execution_times = [m['execution_time'] for m in self.metrics]
        memory_deltas = [m['memory_delta'] for m in self.metrics]
        
        report = {
            'total_calls': len(self.metrics),
            'avg_execution_time': sum(execution_times) / len(execution_times),
            'max_execution_time': max(execution_times),
            'total_memory_delta': sum(memory_deltas),
            'alerts_count': len(self.alerts),
            'recent_alerts': self.alerts[-5:]  # the five most recent alerts
        }
        
        return json.dumps(report, indent=2)

# Using the monitor (monitor_function is a context manager, not a decorator)
monitor = PerformanceMonitor()

def example_function():
    time.sleep(0.1)  # simulate work
    return "result"

# Run the example
with monitor.monitor_function("example_function", threshold=0.05):
    example_function()

print(monitor.get_performance_report())

2. Automated Performance Regression Testing

import unittest
import json
import os
import time
from datetime import datetime

class PerformanceTest(unittest.TestCase):
    BASELINE_FILE = "performance_baseline.json"
    TOLERANCE = 0.1  # tolerate up to a 10% performance regression
    
    def setUp(self):
        self.baseline = self.load_baseline()
    
    def load_baseline(self):
        """加载性能基线"""
        if os.path.exists(self.BASELINE_FILE):
            with open(self.BASELINE_FILE, 'r') as f:
                return json.load(f)
        return {}
    
    def save_baseline(self, test_name, execution_time):
        """保存性能基线"""
        baseline = self.load_baseline()
        baseline[test_name] = {
            'execution_time': execution_time,
            'timestamp': datetime.now().isoformat()
        }
        with open(self.BASELINE_FILE, 'w') as f:
            json.dump(baseline, f, indent=2)
    
    def assert_performance(self, test_name, execution_time, update_baseline=False):
        """断言性能是否符合预期"""
        if update_baseline or test_name not in self.baseline:
            self.save_baseline(test_name, execution_time)
            print(f"Baseline updated for {test_name}: {execution_time:.4f}s")
            return
        
        baseline_time = self.baseline[test_name]['execution_time']
        max_allowed_time = baseline_time * (1 + self.TOLERANCE)
        
        self.assertLessEqual(
            execution_time, 
            max_allowed_time,
            f"Performance regression detected: {test_name} took {execution_time:.4f}s "
            f"(baseline: {baseline_time:.4f}s, max allowed: {max_allowed_time:.4f}s)"
        )
    
    def test_fibonacci_performance(self):
        """测试斐波那契函数性能"""
        start = time.perf_counter()
        result = fibonacci_cached(30)
        execution_time = time.perf_counter() - start
        
        self.assert_performance("fibonacci_30", execution_time)
        self.assertEqual(result, 832040)  # functional correctness check

# Run the performance tests
if __name__ == "__main__":
    unittest.main(verbosity=2)

Summary and Best Practices

Drawing everything together, we can distill the core principles and best practices of Python performance optimization:

Core principles

  1. Measurement over guesswork: always base optimization decisions on real profiling data
  2. Optimize the bottleneck: focus on the true hot spots instead of optimizing prematurely
  3. Accept trade-offs: strike the right balance between performance, readability, and maintainability
  4. Monitor continuously: build a performance monitoring system to catch regressions early

Best-practice checklist

Profiling

  • Use cProfile for whole-program profiling
  • Use line_profiler to pinpoint problem lines
  • Use memory_profiler to watch memory usage
  • Use py-spy for low-overhead profiling in production

Optimization strategy

  • Choose appropriate data structures and algorithms
  • Implement effective caching
  • Process large datasets with generators
  • Exploit concurrency and parallelism

Coding practices

  • Avoid redundant computation and unnecessary work
  • Use vectorized operations for numerical code
  • Optimize string handling and loop structure
  • Manage memory deliberately

Monitoring

  • Establish performance baselines and regression tests
  • Deploy application performance monitoring
  • Alert on key performance indicators
  • Review performance regularly

Python performance optimization is a systematic effort that has to be grounded in your specific business scenario and technology stack. With the tools and techniques covered in this article, developers can identify and resolve performance problems more effectively and build efficient, scalable Python applications.
