突破单核瓶颈：多核协程调度策略

TravisBytes

于 2024-12-25 16:51:22 发布

阅读量1.1k

点赞数 24

分类专栏：网络 # 网络相关问题　文章标签：网络协议 网络 协程 coroutine c++ redis

本文链接：https://blog.youkuaiyun.com/weixin_43925427/article/details/144723217

版权

网络同时被 2 个专栏收录

12 篇文章

订阅专栏

网络相关问题

4 篇文章

订阅专栏

1. 当前单线程协程调度器概述

首先，我们来看一下当前的单线程协程调度器的关键实现。以下是核心数据结构和线程局部存储的定义。

1.1 调度器结构

/* State of the single-threaded coroutine scheduler. */
typedef struct _nty_schedule {
    uint64_t birth;
    ucontext_t ctx;
    void *stack;
    size_t stack_size;
    int spawned_coroutines;
    
    // Scheduling queues
    nty_coroutine_queue ready;     // ready queue
    nty_coroutine_queue defer;     // deferred queue
    nty_coroutine_link busy;       // busy list
    nty_coroutine_rbtree_sleep sleeping;  // red-black tree of sleeping coroutines
    nty_coroutine_rbtree_wait waiting;    // red-black tree of waiting coroutines
    
    // epoll state
    int poller_fd;
    struct epoll_event eventlist[NTY_CO_MAX_EVENTS];
} nty_schedule;

1.2 线程局部存储

// pthread key for thread-specific data; presumably holds each thread's
// scheduler (the code that uses it is not shown here -- TODO confirm).
pthread_key_t global_sched_key;
// One-time-initialization control; presumably guards creation of the key above (TODO confirm).
static pthread_once_t sched_key_once = PTHREAD_ONCE_INIT;

当前的调度器基于单线程实现，所有协程都在一个线程中调度和执行。这种设计在单核或低并发场景下表现良好，但在多核处理器上无法充分利用多核资源。

2. 多核协程调度器设计

为了充分利用多核处理器的性能，需将协程调度器扩展为支持多线程，每个线程独立管理一部分协程，并通过工作窃取和负载均衡机制实现高效的多核调度。

2.1 多线程多核方案

设计一个多线程调度器，每个线程拥有自己的调度器实例，负责管理一部分协程。通过工作窃取机制，当某个线程的就绪队列为空时，可以从其他线程窃取协程以执行。

多线程调度器结构

/* Top-level state shared by all scheduler threads. */
typedef struct _nty_thread_schedule {
    int thread_count;                   // number of scheduler threads
    pthread_t *thread_ids;              // array of thread IDs
    nty_schedule **thread_schedules;    // one scheduler per thread
    
    // Work stealing
    pthread_mutex_t steal_lock;
    nty_coroutine_queue *steal_queues;  // queues of stealable tasks
    
    // Load balancing
    pthread_mutex_t balance_lock;
    uint32_t *thread_loads;            // load of each thread
} nty_thread_schedule;

2.2 关键调度算法

工作窃取算法

当某个线程没有可执行的协程时，可以尝试从其他线程的就绪队列中窃取协程，以提高资源利用率。

/*
 * Conceptual work-stealing step: when the calling thread's ready list is
 * empty, take one coroutine from another thread's stealable queue and append
 * it to the local ready list. The whole probe runs under ts->steal_lock.
 */
void work_stealing(nty_thread_schedule *ts) {
    // Still have local work: nothing to steal.
    if (!TAILQ_EMPTY(&current_schedule->ready)) {
        return;
    }

    pthread_mutex_lock(&ts->steal_lock);

    // Probe the other threads in index order and stop at the first success.
    int victim = 0;
    while (victim < ts->thread_count) {
        if (victim != current_thread_id) {
            nty_coroutine *stolen = steal_task(ts->steal_queues[victim]);
            if (stolen) {
                TAILQ_INSERT_TAIL(&current_schedule->ready, stolen, ready_next);
                break;
            }
        }
        victim++;
    }

    pthread_mutex_unlock(&ts->steal_lock);
}

负载均衡

通过监控各线程的负载情况，动态调整各线程间的协程分配，避免某些线程过载而其他线程空闲。

/*
 * Conceptual load balancer: compute the mean of ts->thread_loads and call
 * migrate_tasks() for every thread whose load exceeds the mean by more than
 * 20%. Runs entirely under ts->balance_lock.
 *
 * Does nothing when there are no threads (avoids dividing by zero).
 */
void balance_load(nty_thread_schedule *ts) {
    pthread_mutex_lock(&ts->balance_lock);

    if (ts->thread_count > 0) {
        // Mean load across all threads (integer division truncates).
        uint32_t total_load = 0;
        for (int i = 0; i < ts->thread_count; i++) {
            total_load += ts->thread_loads[i];
        }
        uint32_t avg_load = total_load / (uint32_t)ts->thread_count;

        // Shed work from threads that are more than 20% above the mean.
        for (int i = 0; i < ts->thread_count; i++) {
            if (ts->thread_loads[i] > avg_load * 1.2) {
                migrate_tasks(ts, i);
            }
        }
    }

    pthread_mutex_unlock(&ts->balance_lock);
}

2.3 关键优化点

无锁队列：采用无锁队列以减少锁竞争，提高任务窃取和调度的效率。
协程亲和性：尽量将相关协程调度到同一线程或核心，提升缓存命中率，减少上下文切换开销。
共享内存：在多进程方案中，使用共享内存而非管道实现进程间通信，提升通信效率。
自适应负载均衡阈值：根据系统负载动态调整负载均衡的触发阈值，避免频繁的负载均衡操作。
协程优先级支持：为协程分配优先级，优先调度高优先级的协程，提高关键任务的响应速度。

3. 多核协程调度器详细实现

下面将详细介绍多核协程调度器的实现，包括数据结构、调度线程函数、协程执行、工作窃取、负载均衡及其他优化。

3.1 数据结构定义

协程结构

/* One coroutine: its saved execution context, its stack, and its scheduling priority. */
typedef struct _nty_coroutine {
    uint64_t birth;
    ucontext_t ctx;
    void *stack;
    size_t stack_size;
    int priority;
    // other members...
} nty_coroutine;

就绪队列结构

/*
 * Array-backed ready queue. enqueue_ready()/dequeue_ready() treat `coroutines`
 * as a ring of `capacity` slots indexed by head and tail, and take `lock`
 * around every access.
 */
typedef struct _nty_coroutine_queue {
    nty_coroutine **coroutines;
    size_t head;
    size_t tail;
    size_t capacity;
    pthread_mutex_t lock;
} nty_coroutine_queue;

多线程调度器结构

/* Top-level state shared by all scheduler threads. */
typedef struct _nty_thread_schedule {
    int thread_count;
    pthread_t *thread_ids;
    nty_schedule **thread_schedules;
    
    // Work stealing
    pthread_mutex_t steal_lock;
    nty_coroutine_queue *steal_queues;
    
    // Load balancing
    pthread_mutex_t balance_lock;
    uint32_t *thread_loads;
} nty_thread_schedule;

全局调度器

// Process-wide multi-thread scheduler; read by get_thread_id() and nty_coroutine_create().
nty_thread_schedule *global_thread_schedule;

3.2 调度线程运行函数

每个调度线程运行一个循环，负责从自己的就绪队列中获取并执行协程，若无可执行协程，则尝试进行工作窃取，并定期进行负载均衡。

/*
 * Entry point of a scheduler thread (pthread start routine).
 *
 * arg is the shared nty_thread_schedule. The thread repeatedly runs the next
 * coroutine from its own ready queue, falls back to work stealing when the
 * queue is empty, and rebalances load every BALANCE_INTERVAL iterations.
 *
 * Returns NULL immediately if the calling thread is not registered in
 * ts->thread_ids; otherwise the loop never exits.
 */
void* schedule_thread_func(void *arg) {
    enum { BALANCE_INTERVAL = 64 }; // iterations between two balance_load() calls

    nty_thread_schedule *ts = (nty_thread_schedule*)arg;
    int thread_id = get_thread_id();
    if (thread_id < 0) {
        // Unknown thread: indexing the per-thread arrays with -1 would be out of bounds.
        return NULL;
    }
    nty_schedule *sched = ts->thread_schedules[thread_id];
    unsigned int iterations = 0;

    while (1) {
        nty_coroutine *co = dequeue_ready(&sched->ready);
        if (co) {
            // The coroutine was already counted in thread_loads when it was
            // enqueued (create/steal/migrate all increment the destination),
            // so only release that count once it has run.
            execute_coroutine(co);
            if (ts->thread_loads[thread_id] > 0) {
                ts->thread_loads[thread_id]--;
            }
        } else {
            // Nothing runnable locally: try to steal from another thread.
            work_stealing(ts);
        }

        // Rebalance periodically rather than on every iteration, so the
        // global balance_lock is not taken in the hot path.
        if (++iterations % BALANCE_INTERVAL == 0) {
            balance_load(ts);
        }

        // Handle other scheduling events such as sleeping/waiting coroutines.
        handle_scheduling_events(sched);
    }
}

3.3 协程执行函数

执行协程涉及保存当前上下文，切换到协程的上下文，并在协程完成后恢复调度器的上下文。

/*
 * Switch from the current scheduler context into coroutine co.
 * The scheduler context is saved in sched->ctx by swapcontext(). If the
 * switch fails, an error is reported and the process exits.
 */
void execute_coroutine(nty_coroutine *co) {
    nty_schedule *sched = get_current_schedule();
    int rc = swapcontext(&sched->ctx, &co->ctx);
    if (rc != -1) {
        return;
    }
    perror("swapcontext failed");
    exit(EXIT_FAILURE);
}

3.4 获取当前线程ID的方法

为了支持工作窃取和负载统计，需要能够获取当前线程的唯一标识。

int get_thread_id() {
    // 使用线程局部存储存储线程ID
    static __thread int thread_id = -1;
    if (thread_id == -1) {
        pthread_mutex_lock(&global_thread_schedule->steal_lock);
        for (int i = 0; i < global_thread_schedule->thread_count; i++) {
            if (pthread_equal(pthread_self(), global_thread_schedule->thread_ids[i])) {
                thread_id = i;
                break;
            }
        }
        pthread_mutex_unlock(&global_thread_schedule->steal_lock);
    }
    return thread_id;
}

3.5 协程创建与调度

创建协程并将其分配到负载最小的线程，以平衡各线程的负载。

nty_coroutine* nty_coroutine_create(void (*func)(void *), void *arg) {
    nty_coroutine *co = malloc(sizeof(nty_coroutine));
    // 初始化协程上下文、栈等
    getcontext(&co->ctx);
    co->stack = malloc(STACK_SIZE);
    co->stack_size = STACK_SIZE;
    co->ctx.uc_stack.ss_sp = co->stack;
    co->ctx.uc_stack.ss_size = STACK_SIZE;
    co->ctx.uc_link = &global_thread_schedule->thread_schedules[0]->ctx; // 切回主调度器上下文
    makecontext(&co->ctx, (void (*)(void))func, 1, arg);
    
    // 将协程分配到负载最小的线程
    nty_thread_schedule *ts = global_thread_schedule;
    int target_thread = find_least_loaded_thread(ts);
    enqueue_ready(&ts->thread_schedules[target_thread]->ready, co);
    ts->thread_loads[target_thread]++;
    
    return co;
}

查找负载最小的线程

/*
 * Return the index of the thread with the smallest entry in ts->thread_loads.
 * Ties go to the lowest index; returns 0 when there are no threads.
 */
int find_least_loaded_thread(nty_thread_schedule *ts) {
    int best = 0;
    uint32_t best_load = UINT32_MAX;

    int idx = 0;
    while (idx < ts->thread_count) {
        uint32_t load = ts->thread_loads[idx];
        if (load < best_load) {
            best_load = load;
            best = idx;
        }
        idx++;
    }
    return best;
}

3.6 工作窃取算法

当前线程没有可执行的协程时，从其他线程的就绪队列中窃取一个协程。

/*
 * Steal one coroutine from another thread's ready queue and append it to the
 * calling thread's ready queue. Stops after the first successful steal.
 *
 * Load accounting moves with the coroutine: the victim's counter is
 * decremented and the thief's incremented. Does nothing if the calling thread
 * is not a registered scheduler thread.
 *
 * NOTE(review): a stolen coroutine keeps the uc_link it was created with;
 * confirm that its return path leads back to the thread that now runs it.
 */
void work_stealing(nty_thread_schedule *ts) {
    int current_thread_id = get_thread_id();
    if (current_thread_id < 0) {
        return;
    }

    for (int i = 0; i < ts->thread_count; i++) {
        if (i == current_thread_id) continue;

        // Steal from the victim's ready queue: that is where runnable
        // coroutines are actually placed by create/migrate.
        nty_coroutine *stolen = dequeue_ready(&ts->thread_schedules[i]->ready);
        if (!stolen) continue;

        enqueue_ready(&ts->thread_schedules[current_thread_id]->ready, stolen);
        if (ts->thread_loads[i] > 0) {
            ts->thread_loads[i]--;
        }
        ts->thread_loads[current_thread_id]++;
        break;
    }
}

3.7 负载均衡

通过监控各线程的负载情况，动态调整协程的分配，确保各线程负载均衡。

负载均衡实现

/*
 * Rebalance under ts->balance_lock: compute the mean of ts->thread_loads
 * (0 when there are no threads) and call migrate_tasks() for each thread
 * whose load is more than 20% above that mean.
 */
void balance_load(nty_thread_schedule *ts) {
    pthread_mutex_lock(&ts->balance_lock);

    // Sum the per-thread load counters.
    uint32_t load_sum = 0;
    int t = 0;
    while (t < ts->thread_count) {
        load_sum += ts->thread_loads[t];
        t++;
    }

    // Truncated mean; stays 0 when thread_count is not positive.
    uint32_t avg_load = 0;
    if (ts->thread_count > 0) {
        avg_load = load_sum / ts->thread_count;
    }

    // Offload every thread that sits above 1.2x the mean.
    for (t = 0; t < ts->thread_count; t++) {
        if (!(ts->thread_loads[t] > avg_load * 1.2)) {
            continue;
        }
        migrate_tasks(ts, t, avg_load);
    }

    pthread_mutex_unlock(&ts->balance_lock);
}

迁移任务的实现

从负载较高的线程迁移部分协程到负载较低的线程。

/*
 * Move ready coroutines from source_thread to the currently least-loaded
 * thread until the source is no more than 20% above avg_load.
 *
 * Stops early when the source has no ready coroutine left, or when moving
 * one more task could not reduce the imbalance (target is the source itself,
 * or target load + 1 >= source load). Without that check a single task is
 * passed back and forth between threads on every balancing round.
 */
void migrate_tasks(nty_thread_schedule *ts, int source_thread, uint32_t avg_load) {
    nty_schedule *source_sched = ts->thread_schedules[source_thread];

    while (ts->thread_loads[source_thread] > avg_load * 1.2) {
        int target_thread = find_least_loaded_thread(ts);
        if (target_thread == source_thread ||
            ts->thread_loads[target_thread] + 1 >= ts->thread_loads[source_thread]) {
            break; // a move would not improve the balance
        }

        nty_coroutine *co = dequeue_ready(&source_sched->ready);
        if (!co) break;

        enqueue_ready(&ts->thread_schedules[target_thread]->ready, co);
        ts->thread_loads[source_thread]--;
        ts->thread_loads[target_thread]++;
    }
}

3.8 其他优化

3.8.1 无锁队列实现

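为了进一步减少锁竞争，可以实现无锁队列。例如，使用原子操作实现双端队列（Deque），支持高效的任务窃取。下面的示例是一个基于原子下标的简化环形队列，仅用于说明思路：它只在单生产者、单消费者的情况下安全；若多个线程会同时窃取，出队一侧还需要使用 CAS（compare-and-swap）来推进 head。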

/*
 * Ring buffer indexed by two atomic counters. lockfree_dequeue() reads at
 * head, lockfree_enqueue() writes at tail; both wrap modulo size.
 */
typedef struct _nty_lockfree_queue {
    atomic_uint head;
    atomic_uint tail;
    nty_coroutine **buffer;
    size_t size;
} nty_lockfree_queue;

/*
 * Allocate and initialize an empty ring queue with `size` slots.
 *
 * One slot is always kept free to tell "full" from "empty", so the queue
 * holds at most size - 1 coroutines. Returns NULL if size < 2 (a ring with
 * fewer slots can never hold an element, and size 0 would make the modulo
 * in enqueue/dequeue divide by zero) or if allocation fails.
 */
nty_lockfree_queue* lockfree_queue_create(size_t size) {
    if (size < 2) {
        return NULL;
    }

    nty_lockfree_queue *q = malloc(sizeof *q);
    if (q == NULL) {
        return NULL;
    }

    q->buffer = calloc(size, sizeof *q->buffer);
    if (q->buffer == NULL) {
        free(q);
        return NULL;
    }

    atomic_init(&q->head, 0);
    atomic_init(&q->tail, 0);
    q->size = size;
    return q;
}

// Enqueue: store co in the slot at tail and advance tail by one (modulo size).
// Returns false when the ring is full, i.e. advancing tail would make it equal
// to head (one slot is always left unused); returns true otherwise.
// NOTE(review): tail is advanced with a plain atomic_load/atomic_store pair,
// not a compare-and-swap, so two threads enqueuing at the same time can read
// the same tail and write the same slot. Looks safe only with a single
// producer -- confirm before using it from several threads.
bool lockfree_enqueue(nty_lockfree_queue *q, nty_coroutine *co) {
    unsigned int tail = atomic_load(&q->tail);
    unsigned int next_tail = (tail + 1) % q->size;
    if (next_tail == atomic_load(&q->head)) {
        return false; // queue full
    }
    q->buffer[tail] = co;               // fill the slot first ...
    atomic_store(&q->tail, next_tail);  // ... then publish it by advancing tail
    return true;
}

// Dequeue: return the coroutine stored at head and advance head by one
// (modulo size). Returns NULL when the ring is empty (head == tail).
// NOTE(review): head is advanced with a plain atomic_load/atomic_store pair,
// not a compare-and-swap, so two threads dequeuing at the same time can both
// return the same coroutine. Looks safe only with a single consumer -- confirm
// before using it for work stealing, where several threads may dequeue.
nty_coroutine* lockfree_dequeue(nty_lockfree_queue *q) {
    unsigned int head = atomic_load(&q->head);
    if (head == atomic_load(&q->tail)) {
        return NULL; // queue empty
    }
    nty_coroutine *co = q->buffer[head];           // read the slot first ...
    atomic_store(&q->head, (head + 1) % q->size);  // ... then release it by advancing head
    return co;
}

3.8.2 协程亲和性

通过实现协程亲和性，将相关协程尽量调度到同一线程，以提高缓存命中率和减少上下文切换开销。

// Record thread_id as the coroutine's preferred scheduler thread.
// select_coroutine_with_affinity() compares this field with the running thread's id.
void set_coroutine_affinity(nty_coroutine *co, int thread_id) {
    co->affinity = thread_id;
}

/*
 * Remove and return the first coroutine in the calling thread's ready queue
 * whose affinity equals the calling thread's id, or NULL if there is none
 * (or the calling thread is not a registered scheduler thread).
 *
 * The scan and the removal run under the queue lock. The relative order of
 * the remaining coroutines is preserved.
 */
nty_coroutine* select_coroutine_with_affinity(nty_thread_schedule *ts) {
    int current_thread_id = get_thread_id();
    if (current_thread_id < 0) {
        return NULL;
    }

    nty_coroutine_queue *q = &ts->thread_schedules[current_thread_id]->ready;
    nty_coroutine *found = NULL;

    pthread_mutex_lock(&q->lock);
    for (size_t i = q->head; i != q->tail; i = (i + 1) % q->capacity) {
        if (q->coroutines[i]->affinity != current_thread_id) {
            continue;
        }
        found = q->coroutines[i];

        // Close the gap: shift the entries in [head, i) one slot toward i,
        // then drop the now-duplicated head slot. This returns the matched
        // coroutine itself rather than whatever happens to be at the head.
        while (i != q->head) {
            size_t prev = (i + q->capacity - 1) % q->capacity;
            q->coroutines[i] = q->coroutines[prev];
            i = prev;
        }
        q->head = (q->head + 1) % q->capacity;
        break;
    }
    pthread_mutex_unlock(&q->lock);

    return found;
}

3.8.3 协程优先级支持

通过为协程分配优先级，可以优先调度高优先级的协程，提高关键任务的响应速度。

/* Coroutine structure, showing only the field added for priority scheduling. */
typedef struct _nty_coroutine {
    // other members
    int priority;
} nty_coroutine;

/*
 * Ready queue as a priority queue: insert co so that higher-priority
 * coroutines sit closer to the head; coroutines of equal priority keep
 * their arrival order.
 *
 * Returns true on success, false when the ring is full (one slot is always
 * left unused so that head == tail means "empty"). Callers that ignore the
 * result keep working as before.
 *
 * All index arithmetic is modulo capacity, so ordering stays correct after
 * head/tail have wrapped around the end of the array.
 */
bool enqueue_ready(nty_coroutine_queue *queue, nty_coroutine *co) {
    pthread_mutex_lock(&queue->lock);

    size_t cap = queue->capacity;
    size_t next_tail = (queue->tail + 1) % cap;
    if (next_tail == queue->head) {
        pthread_mutex_unlock(&queue->lock);
        return false; // queue full
    }

    // Insertion sort from the tail: shift lower-priority entries one slot
    // toward the tail until the right position for co is found.
    size_t slot = queue->tail;
    while (slot != queue->head) {
        size_t prev = (slot + cap - 1) % cap;
        if (queue->coroutines[prev]->priority >= co->priority) {
            break;
        }
        queue->coroutines[slot] = queue->coroutines[prev];
        slot = prev;
    }
    queue->coroutines[slot] = co;
    queue->tail = next_tail;

    pthread_mutex_unlock(&queue->lock);
    return true;
}

/*
 * Pop the coroutine at the head of the ready queue.
 * Returns NULL when the queue is empty (head == tail). Runs under the queue lock.
 */
nty_coroutine* dequeue_ready(nty_coroutine_queue *queue) {
    nty_coroutine *front = NULL;

    pthread_mutex_lock(&queue->lock);
    if (queue->head != queue->tail) {
        front = queue->coroutines[queue->head];
        queue->head = (queue->head + 1) % queue->capacity;
    }
    pthread_mutex_unlock(&queue->lock);

    return front;
}

3.8.4 协程复用与池化

通过实现协程池，复用已完成的协程结构，减少协程创建和销毁的开销。

/*
 * Pool of reusable coroutine objects. `pool` is used as a stack: `size` is
 * the number of stored coroutines, `capacity` the maximum. The pool functions
 * take `lock` around every access.
 */
typedef struct _nty_coroutine_pool {
    nty_coroutine **pool;
    size_t size;
    size_t capacity;
    pthread_mutex_t lock;
} nty_coroutine_pool;

/*
 * Create an empty coroutine pool that can hold up to `capacity` coroutines.
 *
 * Returns NULL if allocation or mutex initialization fails. A capacity of 0
 * is allowed: such a pool never stores anything.
 */
nty_coroutine_pool* coroutine_pool_create(size_t capacity) {
    nty_coroutine_pool *pool = malloc(sizeof *pool);
    if (pool == NULL) {
        return NULL;
    }

    // calloc checks capacity * element-size for overflow.
    pool->pool = calloc(capacity, sizeof *pool->pool);
    if (pool->pool == NULL && capacity > 0) {
        free(pool);
        return NULL;
    }
    pool->size = 0;
    pool->capacity = capacity;

    if (pthread_mutex_init(&pool->lock, NULL) != 0) {
        free(pool->pool);
        free(pool);
        return NULL;
    }
    return pool;
}

/*
 * Take the most recently returned coroutine out of the pool.
 * Returns NULL when the pool is empty. Runs under the pool lock.
 */
nty_coroutine* coroutine_pool_get(nty_coroutine_pool *pool) {
    nty_coroutine *reused;

    pthread_mutex_lock(&pool->lock);
    if (pool->size == 0) {
        reused = NULL;
    } else {
        pool->size -= 1;
        reused = pool->pool[pool->size];
    }
    pthread_mutex_unlock(&pool->lock);

    return reused;
}

/*
 * Hand a coroutine back to the pool.
 * If the pool is already at capacity, the coroutine's stack and the
 * coroutine itself are freed instead. Runs under the pool lock.
 */
void coroutine_pool_put(nty_coroutine_pool *pool, nty_coroutine *co) {
    pthread_mutex_lock(&pool->lock);
    if (pool->size >= pool->capacity) {
        // No room left: release the coroutine.
        free(co->stack);
        free(co);
    } else {
        pool->pool[pool->size] = co;
        pool->size += 1;
    }
    pthread_mutex_unlock(&pool->lock);
}

4. 完整的多核协程调度器示例

以下是一个简化的、多线程多核支持的协程调度器的完整示例，结合了上述各个部分。

#include <pthread.h>
#include <ucontext.h>
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <string.h>

// Constants
#define STACK_SIZE (1024 * 1024)    // bytes allocated for each coroutine stack
#define NTY_CO_MAX_EVENTS 1024      // not referenced in this listing

// Coroutine: saved execution context, stack, and scheduling attributes.
typedef struct _nty_coroutine {
    uint64_t birth;
    ucontext_t ctx;
    void *stack;
    size_t stack_size;
    int priority;
    int affinity; // preferred thread index; nty_coroutine_create() sets -1 for "no affinity"
} nty_coroutine;

// Ready queue: array-backed ring of coroutine pointers indexed by head and
// tail; the queue functions take `lock` around every access.
typedef struct _nty_coroutine_queue {
    nty_coroutine **coroutines;
    size_t head;
    size_t tail;
    size_t capacity;
    pthread_mutex_t lock;
} nty_coroutine_queue;

// Per-thread scheduler state.
typedef struct _nty_schedule {
    uint64_t birth;
    ucontext_t ctx;
    void *stack;
    size_t stack_size;
    int spawned_coroutines;
    
    // Scheduling queues
    nty_coroutine_queue ready;
    // other queues...
} nty_schedule;

// Multi-thread scheduler: state shared by all scheduler threads.
typedef struct _nty_thread_schedule {
    int thread_count;
    pthread_t *thread_ids;
    nty_schedule **thread_schedules;
    
    // Work stealing
    pthread_mutex_t steal_lock;
    nty_coroutine_queue *steal_queues;
    
    // Load balancing
    pthread_mutex_t balance_lock;
    uint32_t *thread_loads;
} nty_thread_schedule;

// Global thread scheduler; assigned in main() and read by get_thread_id(),
// execute_coroutine() and nty_coroutine_create().
nty_thread_schedule *global_thread_schedule;

// Coroutine queue operations

/*
 * Allocate an empty ready queue with `capacity` slots.
 *
 * One slot is always kept free to tell "full" from "empty", so the queue
 * holds at most capacity - 1 coroutines. Returns NULL if capacity < 2 (such a
 * ring can never hold an element, and capacity 0 would make the modulo in
 * enqueue/dequeue divide by zero) or if allocation or mutex initialization
 * fails.
 */
nty_coroutine_queue* queue_create(size_t capacity) {
    if (capacity < 2) {
        return NULL;
    }

    nty_coroutine_queue *queue = malloc(sizeof *queue);
    if (queue == NULL) {
        return NULL;
    }

    // calloc checks capacity * element-size for overflow.
    queue->coroutines = calloc(capacity, sizeof *queue->coroutines);
    if (queue->coroutines == NULL) {
        free(queue);
        return NULL;
    }
    queue->head = 0;
    queue->tail = 0;
    queue->capacity = capacity;

    if (pthread_mutex_init(&queue->lock, NULL) != 0) {
        free(queue->coroutines);
        free(queue);
        return NULL;
    }
    return queue;
}

/*
 * Insert co into the ready queue in priority order: higher-priority
 * coroutines sit closer to the head; coroutines of equal priority keep
 * their arrival order.
 *
 * Returns true on success, false when the ring is full (one slot is always
 * left unused so that head == tail means "empty").
 *
 * All index arithmetic is modulo capacity, so ordering stays correct after
 * head/tail have wrapped around the end of the array.
 */
bool enqueue_ready(nty_coroutine_queue *queue, nty_coroutine *co) {
    pthread_mutex_lock(&queue->lock);

    size_t cap = queue->capacity;
    size_t next_tail = (queue->tail + 1) % cap;
    if (next_tail == queue->head) {
        pthread_mutex_unlock(&queue->lock);
        return false; // queue full
    }

    // Insertion sort from the tail: shift lower-priority entries one slot
    // toward the tail until the right position for co is found.
    size_t slot = queue->tail;
    while (slot != queue->head) {
        size_t prev = (slot + cap - 1) % cap;
        if (queue->coroutines[prev]->priority >= co->priority) {
            break;
        }
        queue->coroutines[slot] = queue->coroutines[prev];
        slot = prev;
    }
    queue->coroutines[slot] = co;
    queue->tail = next_tail;

    pthread_mutex_unlock(&queue->lock);
    return true;
}

/*
 * Pop the coroutine at the head of the ready queue.
 * Returns NULL when the queue is empty (head == tail). Runs under the queue lock.
 */
nty_coroutine* dequeue_ready(nty_coroutine_queue *queue) {
    nty_coroutine *front = NULL;

    pthread_mutex_lock(&queue->lock);
    if (queue->head != queue->tail) {
        front = queue->coroutines[queue->head];
        queue->head = (queue->head + 1) % queue->capacity;
    }
    pthread_mutex_unlock(&queue->lock);

    return front;
}

// 获取当前线程ID
int get_thread_id() {
    // 使用线程局部存储存储线程ID
    static __thread int thread_id = -1;
    if (thread_id == -1) {
        pthread_mutex_lock(&global_thread_schedule->steal_lock);
        for (int i = 0; i < global_thread_schedule->thread_count; i++) {
            if (pthread_equal(pthread_self(), global_thread_schedule->thread_ids[i])) {
                thread_id = i;
                break;
            }
        }
        pthread_mutex_unlock(&global_thread_schedule->steal_lock);
    }
    return thread_id;
}

/*
 * Run a coroutine: save the calling thread's scheduler context in sched->ctx
 * and switch to co's context. If the switch fails, an error is reported and
 * the process exits.
 */
void execute_coroutine(nty_coroutine *co) {
    int self_id = get_thread_id();
    nty_schedule *sched = global_thread_schedule->thread_schedules[self_id];

    int rc = swapcontext(&sched->ctx, &co->ctx);
    if (rc != -1) {
        return;
    }
    perror("swapcontext failed");
    exit(EXIT_FAILURE);
}

/*
 * Work stealing: take one coroutine from another thread's ready queue and
 * append it to the calling thread's ready queue. Stops after the first
 * successful steal.
 *
 * Load accounting moves with the coroutine: the victim's counter is
 * decremented and the thief's incremented. If the local queue cannot accept
 * the coroutine, it is handed back to the victim and no counters change.
 * Does nothing if the calling thread is not a registered scheduler thread.
 *
 * NOTE(review): a stolen coroutine keeps the uc_link it was created with;
 * confirm that its return path leads back to the thread that now runs it.
 */
void work_stealing(nty_thread_schedule *ts) {
    int current_thread_id = get_thread_id();
    if (current_thread_id < 0) {
        return;
    }
    nty_coroutine_queue *own_ready = &ts->thread_schedules[current_thread_id]->ready;

    for (int i = 0; i < ts->thread_count; i++) {
        if (i == current_thread_id) continue;

        // Steal from the victim's ready queue: that is where runnable
        // coroutines are actually placed by create/migrate.
        nty_coroutine_queue *victim_ready = &ts->thread_schedules[i]->ready;
        nty_coroutine *stolen = dequeue_ready(victim_ready);
        if (!stolen) continue;

        if (!enqueue_ready(own_ready, stolen)) {
            // Local queue is full: give the coroutine back rather than lose it.
            enqueue_ready(victim_ready, stolen);
            break;
        }

        if (ts->thread_loads[i] > 0) {
            ts->thread_loads[i]--;
        }
        ts->thread_loads[current_thread_id]++;
        break;
    }
}

// Load balancing
//
// Forward declarations: balance_load() below calls migrate_tasks(), which in
// turn calls find_least_loaded_thread(); both are defined further down. They
// are declared here instead of defining migrate_tasks() twice in one
// translation unit.
int find_least_loaded_thread(nty_thread_schedule *ts);
void migrate_tasks(nty_thread_schedule *ts, int source_thread, uint32_t avg_load);

/*
 * Rebalance under ts->balance_lock: compute the mean of ts->thread_loads
 * (0 when there are no threads) and call migrate_tasks() for each thread
 * whose load is more than 20% above that mean.
 */
void balance_load(nty_thread_schedule *ts) {
    pthread_mutex_lock(&ts->balance_lock);

    // Sum the per-thread load counters.
    uint32_t load_sum = 0;
    int t = 0;
    while (t < ts->thread_count) {
        load_sum += ts->thread_loads[t];
        t++;
    }

    // Truncated mean; stays 0 when thread_count is not positive.
    uint32_t avg_load = 0;
    if (ts->thread_count > 0) {
        avg_load = load_sum / ts->thread_count;
    }

    // Offload every thread that sits above 1.2x the mean.
    for (t = 0; t < ts->thread_count; t++) {
        if (!(ts->thread_loads[t] > avg_load * 1.2)) {
            continue;
        }
        migrate_tasks(ts, t, avg_load);
    }

    pthread_mutex_unlock(&ts->balance_lock);
}

/*
 * Task migration: move ready coroutines from source_thread to the currently
 * least-loaded thread until the source is no more than 20% above avg_load.
 *
 * Stops early when the source has no ready coroutine left, when the target
 * queue is full, or when moving one more task could not reduce the imbalance
 * (target is the source itself, or target load + 1 >= source load). Without
 * that last check a single task is passed back and forth between threads on
 * every balancing round.
 */
void migrate_tasks(nty_thread_schedule *ts, int source_thread, uint32_t avg_load) {
    nty_schedule *source_sched = ts->thread_schedules[source_thread];

    while (ts->thread_loads[source_thread] > avg_load * 1.2) {
        int target_thread = find_least_loaded_thread(ts);
        if (target_thread == source_thread ||
            ts->thread_loads[target_thread] + 1 >= ts->thread_loads[source_thread]) {
            break; // a move would not improve the balance
        }

        nty_coroutine *co = dequeue_ready(&source_sched->ready);
        if (!co) break;

        if (!enqueue_ready(&ts->thread_schedules[target_thread]->ready, co)) {
            // Target queue is full: put the coroutine back instead of losing it.
            enqueue_ready(&source_sched->ready, co);
            break;
        }
        ts->thread_loads[source_thread]--;
        ts->thread_loads[target_thread]++;
    }
}

/*
 * Return the index of the thread with the smallest entry in ts->thread_loads.
 * Ties go to the lowest index; returns 0 when there are no threads.
 */
int find_least_loaded_thread(nty_thread_schedule *ts) {
    int best = 0;
    uint32_t best_load = UINT32_MAX;

    int idx = 0;
    while (idx < ts->thread_count) {
        uint32_t load = ts->thread_loads[idx];
        if (load < best_load) {
            best_load = load;
            best = idx;
        }
        idx++;
    }
    return best;
}

/*
 * Scheduler thread entry point (pthread start routine).
 *
 * arg is the shared nty_thread_schedule. The thread repeatedly runs the next
 * coroutine from its own ready queue, falls back to work stealing when the
 * queue is empty, and rebalances load every BALANCE_INTERVAL iterations.
 *
 * Returns NULL immediately if the calling thread is not registered in
 * ts->thread_ids; otherwise the loop never exits.
 */
void* schedule_thread_func(void *arg) {
    enum { BALANCE_INTERVAL = 64 }; // iterations between two balance_load() calls

    nty_thread_schedule *ts = (nty_thread_schedule*)arg;
    int thread_id = get_thread_id();
    if (thread_id < 0) {
        // Unknown thread: indexing the per-thread arrays with -1 would be out of bounds.
        return NULL;
    }
    nty_schedule *sched = ts->thread_schedules[thread_id];
    unsigned int iterations = 0;

    while (1) {
        nty_coroutine *co = dequeue_ready(&sched->ready);
        if (co) {
            // The coroutine was already counted in thread_loads when it was
            // enqueued (create/steal/migrate all increment the destination),
            // so only release that count once it has run.
            execute_coroutine(co);
            if (ts->thread_loads[thread_id] > 0) {
                ts->thread_loads[thread_id]--;
            }
        } else {
            // Nothing runnable locally: try to steal from another thread.
            work_stealing(ts);
        }

        // Rebalance periodically rather than on every iteration, so the
        // global balance_lock is not taken in the hot path.
        if (++iterations % BALANCE_INTERVAL == 0) {
            balance_load(ts);
        }

        // Other scheduling events (sleeping/waiting coroutines) would be handled here.
    }
}

// 创建多线程调度器
nty_thread_schedule* nty_thread_schedule_create(int thread_count) {
    nty_thread_schedule *ts = calloc(1, sizeof(nty_thread_schedule));
    ts->thread_count = thread_count;
    ts->thread_ids = calloc(thread_count, sizeof(pthread_t));
    ts->thread_schedules = calloc(thread_count, sizeof(nty_schedule*));
    ts->steal_queues = calloc(thread_count, sizeof(nty_coroutine_queue*));
    ts->thread_loads = calloc(thread_count, sizeof(uint32_t));
    pthread_mutex_init(&ts->steal_lock, NULL);
    pthread_mutex_init(&ts->balance_lock, NULL);
    
    // 初始化每个线程的调度器和队列
    for (int i = 0; i < thread_count; i++) {
        ts->thread_schedules[i] = malloc(sizeof(nty_schedule));
        // 初始化调度器的其他成员...
        ts->steal_queues[i] = queue_create(1024);
        pthread_create(&ts->thread_ids[i], NULL, schedule_thread_func, ts);
    }
    
    return ts;
}

// 创建协程
nty_coroutine* nty_coroutine_create(void (*func)(void *), void *arg) {
    nty_coroutine *co = malloc(sizeof(nty_coroutine));
    // 初始化协程上下文、栈等
    getcontext(&co->ctx);
    co->stack = malloc(STACK_SIZE);
    co->stack_size = STACK_SIZE;
    co->ctx.uc_stack.ss_sp = co->stack;
    co->ctx.uc_stack.ss_size = STACK_SIZE;
    co->ctx.uc_link = &global_thread_schedule->thread_schedules[0]->ctx; // 切回主调度器上下文
    makecontext(&co->ctx, (void (*)(void))func, 1, arg);
    co->priority = 0; // 默认优先级
    co->affinity = -1; // 默认无亲和性
    
    // 将协程分配到负载最小的线程
    nty_thread_schedule *ts = global_thread_schedule;
    int target_thread = find_least_loaded_thread(ts);
    enqueue_ready(&ts->thread_schedules[target_thread]->ready, co);
    ts->thread_loads[target_thread]++;
    
    return co;
}

/*
 * Example coroutine body: prints a start message containing the string
 * passed in arg, then returns.
 */
void example_coroutine_func(void *arg) {
    const char *label = (char*)arg;
    printf("协程开始执行: %s\n", label);
    // The coroutine's real work would go here. Nothing switches back to the
    // scheduler explicitly; the function simply returns when it is done.
}

/*
 * Example program: start a 4-thread scheduler, queue four example coroutines,
 * then block until the scheduler threads exit.
 *
 * The main thread waits in pthread_join() instead of spinning in an empty
 * loop, so it does not burn a CPU core that the scheduler threads could use.
 */
int main(void) {
    // Create the multi-thread scheduler (4 threads as an example).
    global_thread_schedule = nty_thread_schedule_create(4);
    if (global_thread_schedule == NULL) {
        fprintf(stderr, "failed to create thread scheduler\n");
        return EXIT_FAILURE;
    }

    // Create the example coroutines.
    nty_coroutine_create(example_coroutine_func, "协程1");
    nty_coroutine_create(example_coroutine_func, "协程2");
    nty_coroutine_create(example_coroutine_func, "协程3");
    nty_coroutine_create(example_coroutine_func, "协程4");

    // Wait for the scheduler threads.
    for (int i = 0; i < global_thread_schedule->thread_count; i++) {
        pthread_join(global_thread_schedule->thread_ids[i], NULL);
    }

    return 0;
}