Kernel Workqueues

This article covers the basic concepts of kernel workqueues, their data structures, how scheduling and execution work, and the workqueues the system creates by default. A workqueue is a mechanism for deferring task execution, built on kernel threads. The article lists the relevant interfaces, analyzes the workqueue, work, and worker data structures, and introduces the workqueues and workers the system creates by default, which greatly simplify usage.

    1. Basic concepts:

    A workqueue is a kernel mechanism for deferring task execution. Unlike softirqs and tasklets, workqueues are built on kernel threads and are therefore allowed to sleep. Let us first go over a few workqueue concepts.

    (1) work: a task to be scheduled and executed.

    (2) worker: the entity that actually executes scheduled tasks.

    (3) workqueue: an object that holds a set of works and workers.

    Next, let us look at the interfaces the kernel provides for workqueues:

    (1) Creating a workqueue: create_workqueue

                Creates a workqueue.

    (2) Creating a worker: create_worker

                Creates a worker that executes work items.

    (3) Scheduling work: schedule_work

                The user initializes a work item, then calls schedule_work to add it to a workqueue and have the workqueue schedule it for execution.

        From the above we can see the essence of workqueues:

        (1) First, a workqueue is created along with its workers, and the workers are attached to the workqueue.

        (2) A work item is created and queued on the workqueue via schedule_work(), and a worker of the workqueue is scheduled to execute it.

        (3) Each worker is a kernel thread, and the work item's handler runs inside that kernel thread (see the minimal sketch below).
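    Before diving into the data structures, here is a minimal sketch of that usage from the client side. The module and the names my_work/my_work_fn are illustrative, not taken from the kernel:

#include <linux/module.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
    /* Runs in a kworker kernel thread, so sleeping is allowed here. */
    pr_info("deferred work executed\n");
}

/* Statically define and initialize a work item bound to my_work_fn. */
static DECLARE_WORK(my_work, my_work_fn);

static int __init wq_demo_init(void)
{
    schedule_work(&my_work);    /* queue it on the default system_wq */
    return 0;
}

static void __exit wq_demo_exit(void)
{
    cancel_work_sync(&my_work); /* ensure it is neither pending nor running */
}

module_init(wq_demo_init);
module_exit(wq_demo_exit);
MODULE_LICENSE("GPL");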

    2. Data structures:

    (1) work_struct:

struct work_struct {
    atomic_long_t data;        // flags plus pool/pwq information, packed into one word
    struct list_head entry;    // list node; nominally queued on the workqueue, actually on a worker_pool's worklist
    work_func_t func;    // the work handler
#ifdef CONFIG_LOCKDEP
    struct lockdep_map lockdep_map;
#endif
};
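    In practice, a work_struct is usually embedded in a larger context object, and the handler recovers that object with container_of. A minimal sketch, assuming a hypothetical struct my_dev:

#include <linux/workqueue.h>

struct my_dev {
    int value;
    struct work_struct work;    /* work item embedded in the device object */
};

static void my_dev_work_fn(struct work_struct *work)
{
    /* Recover the enclosing object from the embedded work_struct. */
    struct my_dev *dev = container_of(work, struct my_dev, work);

    pr_info("my_dev value = %d\n", dev->value);
}

static void my_dev_setup(struct my_dev *dev)
{
    dev->value = 42;
    INIT_WORK(&dev->work, my_dev_work_fn);    /* bind the handler at runtime */
    schedule_work(&dev->work);
}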

    (2) worker:

struct worker {
    /* on idle list while idle, on busy hash table while busy */
    union {
        struct list_head    entry;    /* L: while idle */
        struct hlist_node    hentry;    /* L: while busy */
    };    // union: node on the pool's idle list or busy hash table

    struct work_struct    *current_work;    /* L: work being processed */    // work currently being processed
    work_func_t        current_func;    /* L: current_work's fn */    // its handler
    struct pool_workqueue    *current_pwq; /* L: current_work's pwq */
    struct list_head    scheduled;    /* L: scheduled works */    // works scheduled onto this worker

    /* 64 bytes boundary on 64bit, 32 on 32bit */

    struct task_struct    *task;        /* I: worker task */    // the worker's kernel thread
    struct worker_pool    *pool;        /* A: the associated pool */    // the pool this worker belongs to
                        /* L: for rescuers */
    struct list_head    node;        /* A: anchored at pool->workers */    // node on pool->workers
                        /* A: runs through worker->node */

    unsigned long        last_active;    /* L: last active timestamp */
    unsigned int        flags;        /* X: flags */
    int            id;        /* I: worker id */

    /*
     * Opaque string set with work_set_desc().  Printed out with task
     * dump for debugging - WARN, BUG, panic or sysrq.
     */
    char            desc[WORKER_DESC_LEN];

    /* used only by rescuers to point to the target workqueue */
    struct workqueue_struct    *rescue_wq;    /* I: the workqueue to rescue */
};

    (3) workqueue_struct:

struct workqueue_struct {
    struct list_head    pwqs;        /* WR: all pwqs of this wq */
    struct list_head    list;        /* PR: list of all workqueues */    // node on the global workqueues list

    struct mutex        mutex;        /* protects this wq */
    int            work_color;    /* WQ: current work color */
    int            flush_color;    /* WQ: current flush color */
    atomic_t        nr_pwqs_to_flush; /* flush in progress */
    struct wq_flusher    *first_flusher;    /* WQ: first flusher */
    struct list_head    flusher_queue;    /* WQ: flush waiters */
    struct list_head    flusher_overflow; /* WQ: flush overflow list */

    struct list_head    maydays;    /* MD: pwqs requesting rescue */
    struct worker        *rescuer;    /* I: rescue worker */

    int            nr_drainers;    /* WQ: drain in progress */
    int            saved_max_active; /* WQ: saved pwq max_active */

    struct workqueue_attrs    *unbound_attrs;    /* PW: only for unbound wqs */
    struct pool_workqueue    *dfl_pwq;    /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
    struct wq_device    *wq_dev;    /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
    struct lockdep_map    lockdep_map;
#endif
    char            name[WQ_NAME_LEN]; /* I: workqueue name */

    /*
     * Destruction of workqueue_struct is sched-RCU protected to allow
     * walking the workqueues list without grabbing wq_pool_mutex.
     * This is used to dump all workqueues from sysrq.
     */
    struct rcu_head        rcu;

    /* hot fields used during command issue, aligned to cacheline */
    unsigned int        flags ____cacheline_aligned; /* WQ: WQ_* flags */
    struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
    struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};

    (4) worker_pool:

struct worker_pool {
    spinlock_t        lock;        /* the pool lock */
    int            cpu;        /* I: the associated cpu */
    int            node;        /* I: the associated node ID */
    int            id;        /* I: pool ID */
    unsigned int        flags;        /* X: flags */

    unsigned long        watchdog_ts;    /* L: watchdog timestamp */

    struct list_head    worklist;    /* L: list of pending works */    // all works pending on this pool

    int            nr_workers;    /* L: total number of workers */    // number of workers in this pool
    int            nr_idle;    /* L: currently idle workers */    // number of idle workers in this pool

    struct list_head    idle_list;    /* X: list of idle workers */
    struct timer_list    idle_timer;    /* L: worker idle timeout */
    struct timer_list    mayday_timer;    /* L: SOS timer for workers */

    /* a worker is either on busy_hash or idle_list, or the manager */
    DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                        /* L: hash of busy workers */

    struct worker        *manager;    /* L: purely informational */
    struct list_head    workers;    /* A: attached workers */    // list of all workers attached to this pool
    struct completion    *detach_completion; /* all workers detached */    // completion used when tearing workers down

    struct ida        worker_ida;    /* worker IDs for task name */

    struct workqueue_attrs    *attrs;        /* I: worker attributes */
    struct hlist_node    hash_node;    /* PL: unbound_pool_hash node */
    int            refcnt;        /* PL: refcnt for unbound pools */

    /*
     * The current concurrency level.  As it's likely to be accessed
     * from other CPUs during try_to_wake_up(), put it in a separate
     * cacheline.
     */
    atomic_t        nr_running ____cacheline_aligned_in_smp;

    /*
     * Destruction of pool is sched-RCU protected to allow dereferences
     * from get_work_pool().
     */
    struct rcu_head        rcu;
} ____cacheline_aligned_in_smp;
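    One structure that appears throughout the code but is not listed above is pool_workqueue (pwq). It is the glue between a workqueue_struct and a worker_pool, carrying the per-(workqueue, pool) state that shows up later in the code, such as max_active, nr_active and the delayed_works list. A bound workqueue has one pwq per CPU (cpu_pwqs), while an unbound workqueue looks its pwqs up per NUMA node (numa_pwq_tbl). A work queued on a workqueue therefore ends up on the worklist of the worker_pool behind the selected pwq, where that pool's workers pick it up.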

    The data structures and code carry a lot of detail, but we can set those details aside, find the core fields and interfaces we care about, and come back to the rest once the core idea is understood.

    3. Interface implementation:

    (1) Creating a workqueue:

#define create_workqueue(name)                        \
    alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))

#define alloc_workqueue(fmt, flags, max_active, args...)        \
    __alloc_workqueue_key((fmt), (flags), (max_active),        \
                  NULL, NULL, ##args)

struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                           unsigned int flags,
                           int max_active,
                           struct lock_class_key *key,
                           const char *lock_name, ...)
{
    size_t tbl_size = 0;
    va_list args;
    struct workqueue_struct *wq;
    struct pool_workqueue *pwq;

    /*
     * Unbound && max_active == 1 used to imply ordered, which is no
     * longer the case on NUMA machines due to per-node pools.  While
     * alloc_ordered_workqueue() is the right way to create an ordered
     * workqueue, keep the previous behavior to avoid subtle breakages
     * on NUMA.
     */
    if ((flags & WQ_UNBOUND) && max_active == 1)
        flags |= __WQ_ORDERED;

    /* see the comment above the definition of WQ_POWER_EFFICIENT */
    if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
        flags |= WQ_UNBOUND;

    /* allocate wq and format name */
    if (flags & WQ_UNBOUND)
        tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);

    wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);    // allocate the workqueue_struct instance
    if (!wq)
        return NULL;

    if (flags & WQ_UNBOUND) {
        wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!wq->unbound_attrs)
            goto err_free_wq;
    }

    va_start(args, lock_name);
    vsnprintf(wq->name, sizeof(wq->name), fmt, args);    // format the workqueue's name
    va_end(args);

    max_active = max_active ?: WQ_DFL_ACTIVE;
    max_active = wq_clamp_max_active(max_active, flags, wq->name);

    /* init wq */
    wq->flags = flags;
    wq->saved_max_active = max_active;
    mutex_init(&wq->mutex);
    atomic_set(&wq->nr_pwqs_to_flush, 0);
    INIT_LIST_HEAD(&wq->pwqs);
    INIT_LIST_HEAD(&wq->flusher_queue);
    INIT_LIST_HEAD(&wq->flusher_overflow);
    INIT_LIST_HEAD(&wq->maydays);

    lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
    INIT_LIST_HEAD(&wq->list);

    if (alloc_and_link_pwqs(wq) < 0)
        goto err_free_wq;

    if (wq_online && init_rescuer(wq) < 0)
        goto err_destroy;

    if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
        goto err_destroy;

    /*
     * wq_pool_mutex protects global freeze state and workqueues list.
     * Grab it, adjust max_active and add the new @wq to workqueues
     * list.
     */
    mutex_lock(&wq_pool_mutex);

    mutex_lock(&wq->mutex);
    for_each_pwq(pwq, wq)
        pwq_adjust_max_active(pwq);
    mutex_unlock(&wq->mutex);

    list_add_tail_rcu(&wq->list, &workqueues);    // add the workqueue to the global workqueues list

    mutex_unlock(&wq_pool_mutex);

    return wq;

err_free_wq:
    free_workqueue_attrs(wq->unbound_attrs);
    kfree(wq);
    return NULL;
err_destroy:
    destroy_workqueue(wq);
    return NULL;
}
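    For users who want a dedicated workqueue instead of the system default, the usual pairing is alloc_workqueue at load time and destroy_workqueue at unload, with queue_work targeting the private queue. A minimal sketch, with illustrative names and flags:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static void my_wq_work_fn(struct work_struct *work)
{
    pr_info("running on a dedicated workqueue\n");
}
static DECLARE_WORK(my_wq_work, my_wq_work_fn);

static int __init mywq_demo_init(void)
{
    /* Unbound queue with max_active = 1, i.e. works run one at a time. */
    my_wq = alloc_workqueue("my_wq", WQ_UNBOUND, 1);
    if (!my_wq)
        return -ENOMEM;

    queue_work(my_wq, &my_wq_work);    /* target our queue, not system_wq */
    return 0;
}

static void __exit mywq_demo_exit(void)
{
    flush_workqueue(my_wq);            /* wait for everything queued so far */
    destroy_workqueue(my_wq);
}

module_init(mywq_demo_init);
module_exit(mywq_demo_exit);
MODULE_LICENSE("GPL");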

    (2) Creating a worker:

static struct worker *create_worker(struct worker_pool *pool)
{
    struct worker *worker = NULL;
    int id = -1;
    char id_buf[16];

    /* ID is needed to determine kthread name */
    id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);    // allocate a worker id within this pool
    if (id < 0)
        goto fail;

    worker = alloc_worker(pool->node);    // allocate the worker instance
    if (!worker)
        goto fail;

    worker->id = id;    // record the worker id

    if (pool->cpu >= 0)
        snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
             pool->attrs->nice < 0  ? "H" : "");
    else
        snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);

    worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                          "kworker/%s", id_buf);    // create the worker's kernel thread

    if (IS_ERR(worker->task))
        goto fail;

    set_user_nice(worker->task, pool->attrs->nice);
    kthread_bind_mask(worker->task, pool->attrs->cpumask);

    /* successful, attach the worker to the pool */
    worker_attach_to_pool(worker, pool);    // attach the worker to the pool's workers list

    /* start the newly created worker */
    spin_lock_irq(&pool->lock);
    worker->pool->nr_workers++;    // update the pool's worker count
    worker_enter_idle(worker);    // put the worker on the idle list
    wake_up_process(worker->task);    // wake the worker thread, which runs worker_thread
    spin_unlock_irq(&pool->lock);

    return worker;

fail:
    if (id >= 0)
        ida_simple_remove(&pool->worker_ida, id);
    kfree(worker);
    return NULL;
}
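    A side note on naming: the id_buf formats above are where the familiar kworker thread names come from. A worker of a per-CPU pool shows up in ps as kworker/<cpu>:<id> (with an H suffix for the high-priority pool), while a worker of an unbound pool shows up as kworker/u<pool-id>:<id>.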

    (3) Scheduling work:

static inline bool schedule_work(struct work_struct *work)
{
    /*
     * By default the work goes onto system_wq, a workqueue the system
     * creates automatically at boot.  To schedule onto a user-created
     * workqueue, call queue_work() directly instead.
     */
    return queue_work(system_wq, work);
}

static inline bool queue_work(struct workqueue_struct *wq,
                  struct work_struct *work)
{
    return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}

bool queue_work_on(int cpu, struct workqueue_struct *wq,
           struct work_struct *work)
{
    bool ret = false;
    unsigned long flags;

    local_irq_save(flags);

    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
        __queue_work(cpu, wq, work);
        ret = true;
    }

    local_irq_restore(flags);
    return ret;
}
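    queue_work_on can also be called directly when a work item must execute on one particular CPU; the WORK_CPU_UNBOUND used above simply lets the workqueue pick a CPU itself. A minimal sketch (the CPU number is illustrative and must be online):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/smp.h>

static void pinned_fn(struct work_struct *work)
{
    /* Per-CPU workers are bound to one CPU, so this is stable here. */
    pr_info("executing on CPU %d\n", smp_processor_id());
}
static DECLARE_WORK(pinned_work, pinned_fn);

static int __init pin_demo_init(void)
{
    /* Queue on system_wq but force execution on CPU 2. */
    queue_work_on(2, system_wq, &pinned_work);
    return 0;
}
module_init(pin_demo_init);
MODULE_LICENSE("GPL");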

static void __queue_work(int cpu, struct workqueue_struct *wq,
             struct work_struct *work)

{
    struct pool_workqueue *pwq;
    struct worker_pool *last_pool;
    struct list_head *worklist;
    unsigned int work_flags;
    unsigned int req_cpu = cpu;

    /*
     * While a work item is PENDING && off queue, a task trying to
     * steal the PENDING will busy-loop waiting for it to either get
     * queued or lose PENDING.  Grabbing PENDING and queueing should
     * happen with IRQ disabled.
     */
    lockdep_assert_irqs_disabled();

    debug_work_activate(work);

    /* if draining, only works from the same workqueue are allowed */
    if (unlikely(wq->flags & __WQ_DRAINING) &&
        WARN_ON_ONCE(!is_chained_work(wq)))
        return;
retry:
    if (req_cpu == WORK_CPU_UNBOUND)
        cpu = wq_select_unbound_cpu(raw_smp_processor_id());

    /* pwq which will be used unless @work is executing elsewhere */
    if (!(wq->flags & WQ_UNBOUND))
        pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
    else
        pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));

    /*
     * If @work was previously on a different pool, it might still be
     * running there, in which case the work needs to be queued on that
     * pool to guarantee non-reentrancy.
     */
    last_pool = get_work_pool(work);    // the pool this work last ran on
    if (last_pool && last_pool != pwq->pool) {
        struct worker *worker;

        spin_lock(&last_pool->lock);

        worker = find_worker_executing_work(last_pool, work);    // is a worker there still executing this work?

        if (worker && worker->current_pwq->wq == wq) {
            pwq = worker->current_pwq;
        } else {
            /* meh... not running there, queue here */
            spin_unlock(&last_pool->lock);
            spin_lock(&pwq->pool->lock);
        }
    } else {
        spin_lock(&pwq->pool->lock);
    }

    /*
     * pwq is determined and locked.  For unbound pools, we could have
     * raced with pwq release and it could already be dead.  If its
     * refcnt is zero, repeat pwq selection.  Note that pwqs never die
     * without another pwq replacing it in the numa_pwq_tbl or while
     * work items are executing on it, so the retrying is guaranteed to
     * make forward-progress.
     */
    if (unlikely(!pwq->refcnt)) {
        if (wq->flags & WQ_UNBOUND) {
            spin_unlock(&pwq->pool->lock);
            cpu_relax();
            goto retry;
        }
        /* oops */
        WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
              wq->name, cpu);
    }

    /* pwq determined, queue */
    trace_workqueue_queue_work(req_cpu, pwq, work);

    if (WARN_ON(!list_empty(&work->entry))) {
        spin_unlock(&pwq->pool->lock);
        return;
    }

    pwq->nr_in_flight[pwq->work_color]++;
    work_flags = work_color_to_flags(pwq->work_color);

    if (likely(pwq->nr_active < pwq->max_active)) {
        trace_workqueue_activate_work(work);
        pwq->nr_active++;
        worklist = &pwq->pool->worklist;
        if (list_empty(worklist))
            pwq->pool->watchdog_ts = jiffies;
    } else {
        work_flags |= WORK_STRUCT_DELAYED;
        worklist = &pwq->delayed_works;
    }

    insert_work(pwq, work, worklist, work_flags);    // insert the work onto the chosen list (pool worklist or delayed_works)

    spin_unlock(&pwq->pool->lock);
}

static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
            struct list_head *head, unsigned int extra_flags)
{
    struct worker_pool *pool = pwq->pool;

    /* we own @work, set data and link */
    set_work_pwq(work, pwq, extra_flags);
    list_add_tail(&work->entry, head);    // link the work onto the target list
    get_pwq(pwq);

    /*
     * Ensure either wq_worker_sleeping() sees the above
     * list_add_tail() or we see zero nr_running to avoid workers lying
     * around lazily while there are works to be processed.
     */
    smp_mb();

    if (__need_more_worker(pool))
        wake_up_worker(pool);    // wake an idle worker in the pool
}

static void wake_up_worker(struct worker_pool *pool)
{
    struct worker *worker = first_idle_worker(pool);

    if (likely(worker))
        wake_up_process(worker->task);    // wake the worker's kernel thread, which runs worker_thread
}

    Now let us look at how the worker_thread function ultimately executes:

static int worker_thread(void *__worker)
{
    struct worker *worker = __worker;
    struct worker_pool *pool = worker->pool;    // the pool this worker belongs to

    /* tell the scheduler that this is a workqueue worker */
    set_pf_worker(true);
woke_up:
    spin_lock_irq(&pool->lock);

    /* am I supposed to die? */
    if (unlikely(worker->flags & WORKER_DIE)) {
        spin_unlock_irq(&pool->lock);
        WARN_ON_ONCE(!list_empty(&worker->entry));
        set_pf_worker(false);

        set_task_comm(worker->task, "kworker/dying");
        ida_simple_remove(&pool->worker_ida, worker->id);
        worker_detach_from_pool(worker);
        kfree(worker);
        return 0;
    }

    worker_leave_idle(worker);
recheck:
    /* no more worker necessary? */
    if (!need_more_worker(pool))
        goto sleep;

    /* do we need to manage? */
    if (unlikely(!may_start_working(pool)) && manage_workers(worker))
        goto recheck;

    /*
     * ->scheduled list can only be filled while a worker is
     * preparing to process a work or actually processing it.
     * Make sure nobody diddled with it while I was sleeping.
     */
    WARN_ON_ONCE(!list_empty(&worker->scheduled));

    /*
     * Finish PREP stage.  We're guaranteed to have at least one idle
     * worker or that someone else has already assumed the manager
     * role.  This is where @worker starts participating in concurrency
     * management if applicable and concurrency management is restored
     * after being rebound.  See rebind_workers() for details.
     */
    worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

    do {    // loop over all works on the pool's worklist
        struct work_struct *work =
            list_first_entry(&pool->worklist,
                     struct work_struct, entry);    // take the first work

        pool->watchdog_ts = jiffies;

        if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
            /* optimization path, not strictly necessary */
            process_one_work(worker, work);    // actually run the user-registered handler
            if (unlikely(!list_empty(&worker->scheduled)))
                process_scheduled_works(worker);
        } else {
            move_linked_works(work, &worker->scheduled, NULL);
            process_scheduled_works(worker);
        }
    } while (keep_working(pool));

    worker_set_flags(worker, WORKER_PREP);
sleep:
    /*
     * pool->lock is held and there's no work to process and no need to
     * manage, sleep.  Workers are woken up only while holding
     * pool->lock or from local cpu, so setting the current state
     * before releasing pool->lock is enough to prevent losing any
     * event.
     */
    worker_enter_idle(worker);
    __set_current_state(TASK_IDLE);
    spin_unlock_irq(&pool->lock);
    schedule();
    goto woke_up;
}

static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
    struct pool_workqueue *pwq = get_work_pwq(work);
    struct worker_pool *pool = worker->pool;
    bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
    int work_color;
    struct worker *collision;
#ifdef CONFIG_LOCKDEP
    /*
     * It is permissible to free the struct work_struct from
     * inside the function that is called from it, this we need to
     * take into account for lockdep too.  To avoid bogus "held
     * lock freed" warnings as well as problems when looking into
     * work->lockdep_map, make a copy and use that here.
     */
    struct lockdep_map lockdep_map;

    lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
    /* ensure we're on the correct CPU */
    WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
             raw_smp_processor_id() != pool->cpu);

    /*
     * A single work shouldn't be executed concurrently by
     * multiple workers on a single cpu.  Check whether anyone is
     * already processing the work.  If so, defer the work to the
     * currently executing one.
     */
    collision = find_worker_executing_work(pool, work);
    if (unlikely(collision)) {
        move_linked_works(work, &collision->scheduled, NULL);
        return;
    }

    /* claim and dequeue */
    debug_work_deactivate(work);
    hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
    worker->current_work = work;
    worker->current_func = work->func;
    worker->current_pwq = pwq;
    work_color = get_work_color(work);    // set up the worker's current work state

    /*
     * Record wq name for cmdline and debug reporting, may get
     * overridden through set_worker_desc().
     */
    strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

    list_del_init(&work->entry);

    /*
     * CPU intensive works don't participate in concurrency management.
     * They're the scheduler's responsibility.  This takes @worker out
     * of concurrency management and the next code block will chain
     * execution of the pending work items.
     */
    if (unlikely(cpu_intensive))
        worker_set_flags(worker, WORKER_CPU_INTENSIVE);

    /*
     * Wake up another worker if necessary.  The condition is always
     * false for normal per-cpu workers since nr_running would always
     * be >= 1 at this point.  This is used to chain execution of the
     * pending work items for WORKER_NOT_RUNNING workers such as the
     * UNBOUND and CPU_INTENSIVE ones.
     */
    if (need_more_worker(pool))
        wake_up_worker(pool);

    /*
     * Record the last pool and clear PENDING which should be the last
     * update to @work.  Also, do this inside @pool->lock so that
     * PENDING and queued state changes happen together while IRQ is
     * disabled.
     */
    set_work_pool_and_clear_pending(work, pool->id);

    spin_unlock_irq(&pool->lock);

    lock_map_acquire(&pwq->wq->lockdep_map);
    lock_map_acquire(&lockdep_map);
    /*
     * Strictly speaking we should mark the invariant state without holding
     * any locks, that is, before these two lock_map_acquire()'s.
     *
     * However, that would result in:
     *
     *   A(W1)
     *   WFC(C)
     *        A(W1)
     *        C(C)
     *
     * Which would create W1->C->W1 dependencies, even though there is no
     * actual deadlock possible. There are two solutions, using a
     * read-recursive acquire on the work(queue) 'locks', but this will then
     * hit the lockdep limitation on recursive locks, or simply discard
     * these locks.
     *
     * AFAICT there is no possible deadlock scenario between the
     * flush_work() and complete() primitives (except for single-threaded
     * workqueues), so hiding them isn't a problem.
     */
    lockdep_invariant_state(true);
    trace_workqueue_execute_start(work);
    worker->current_func(work);    // finally invoke the user-registered handler
    /*
     * While we must be careful to not use "work" after this, the trace
     * point will only record its address.
     */
    trace_workqueue_execute_end(work);
    lock_map_release(&lockdep_map);
    lock_map_release(&pwq->wq->lockdep_map);

    if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
        pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
               "     last function: %pf\n",
               current->comm, preempt_count(), task_pid_nr(current),
               worker->current_func);
        debug_show_held_locks(current);
        dump_stack();
    }

    /*
     * The following prevents a kworker from hogging CPU on !PREEMPT
     * kernels, where a requeueing work item waiting for something to
     * happen could deadlock with stop_machine as such work item could
     * indefinitely requeue itself while all other CPUs are trapped in
     * stop_machine. At the same time, report a quiescent RCU state so
     * the same condition doesn't freeze RCU.
     */
    cond_resched();

    spin_lock_irq(&pool->lock);

    /* clear cpu intensive status */
    if (unlikely(cpu_intensive))
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

    /* we're done with it, release */
    hash_del(&worker->hentry);
    worker->current_work = NULL;
    worker->current_func = NULL;
    worker->current_pwq = NULL;
    pwq_dec_nr_in_flight(pwq, work_color);
}
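    Note that process_one_work guarantees a work item is never run concurrently by multiple workers on the same pool (the collision check above), but it does not synchronize with code that frees the work item. Callers tearing down a work item should first call flush_work() or cancel_work_sync(). A minimal sketch of the cleanup path, reusing the hypothetical struct my_dev from the earlier sketch:

#include <linux/slab.h>

static void my_dev_teardown(struct my_dev *dev)
{
    /*
     * Remove the work if it is still pending; if the handler is already
     * running, wait for it to finish.  Only after that is it safe to
     * free the object that embeds the work_struct.
     */
    cancel_work_sync(&dev->work);
    kfree(dev);
}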

    At this point we have analyzed the overall workqueue flow from a high level. One more thing is worth knowing: during initialization the system creates several default workqueues for us, so we usually do not have to create a workqueue and its workers ourselves. To use them, we simply call schedule_work (which queues on system_wq by default) or queue_work (naming a workqueue that already exists in the system). This greatly simplifies workqueue usage and makes client code more convenient to write.
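    As a rule of thumb, going by the descriptions in include/linux/workqueue.h: system_wq is meant for short works, system_long_wq for works that may take noticeably longer, system_unbound_wq for works that need not be bound to the submitting CPU and may run for a long time, system_highpri_wq for works that require high priority, and the freezable/power-efficient variants for suspend-aware and power-conscious use respectively.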

    (1) Workqueues created by default by the system:

int __init workqueue_init_early(void)
{
    int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
    int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
    int i, cpu;

    WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

    BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
    cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));

    pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

    /* initialize CPU pools */
    for_each_possible_cpu(cpu) {
        struct worker_pool *pool;

        i = 0;
        for_each_cpu_worker_pool(pool, cpu) {
            BUG_ON(init_worker_pool(pool));
            pool->cpu = cpu;
            cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
            pool->attrs->nice = std_nice[i++];
            pool->node = cpu_to_node(cpu);

            /* alloc pool ID */
            mutex_lock(&wq_pool_mutex);
            BUG_ON(worker_pool_assign_id(pool));
            mutex_unlock(&wq_pool_mutex);
        }
    }

    /* create default unbound and ordered wq attrs */
    for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
        struct workqueue_attrs *attrs;

        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        unbound_std_wq_attrs[i] = attrs;

        /*
         * An ordered wq should have only one pwq as ordering is
         * guaranteed by max_active which is enforced by pwqs.
         * Turn off NUMA so that dfl_pwq is used for all nodes.
         */
        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        attrs->no_numa = true;
        ordered_wq_attrs[i] = attrs;
    }

    // the workqueues the system creates by default

    system_wq = alloc_workqueue("events", 0, 0);
    system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
    system_long_wq = alloc_workqueue("events_long", 0, 0);
    system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                        WQ_UNBOUND_MAX_ACTIVE);
    system_freezable_wq = alloc_workqueue("events_freezable",
                          WQ_FREEZABLE, 0);
    system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                          WQ_POWER_EFFICIENT, 0);
    system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                          WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                          0);

    BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
           !system_unbound_wq || !system_freezable_wq ||
           !system_power_efficient_wq ||
           !system_freezable_power_efficient_wq);

    return 0;
}

    (2) Workers created by default by the system:

int __init workqueue_init(void)
{
    struct workqueue_struct *wq;
    struct worker_pool *pool;
    int cpu, bkt;

    /*
     * It'd be simpler to initialize NUMA in workqueue_init_early() but
     * CPU to node mapping may not be available that early on some
     * archs such as power and arm64.  As per-cpu pools created
     * previously could be missing node hint and unbound pools NUMA
     * affinity, fix them up.
     *
     * Also, while iterating workqueues, create rescuers if requested.
     */
    wq_numa_init();

    mutex_lock(&wq_pool_mutex);

    for_each_possible_cpu(cpu) {
        for_each_cpu_worker_pool(pool, cpu) {
            pool->node = cpu_to_node(cpu);
        }
    }

    list_for_each_entry(wq, &workqueues, list) {
        wq_update_unbound_numa(wq, smp_processor_id(), true);
        WARN(init_rescuer(wq),
             "workqueue: failed to create early rescuer for %s",
             wq->name);
    }

    mutex_unlock(&wq_pool_mutex);

    /* create the initial workers */
    for_each_online_cpu(cpu) {
        for_each_cpu_worker_pool(pool, cpu) {
            pool->flags &= ~POOL_DISASSOCIATED;
            BUG_ON(!create_worker(pool));    // create the initial worker for each per-CPU pool
        }
    }

    hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
        BUG_ON(!create_worker(pool));    // create the initial worker for each unbound pool

    wq_online = true;
    wq_watchdog_init();

    return 0;
}

Reposted from: https://my.oschina.net/yepanl/blog/3051288
