Kernel Workqueues

This article covers the basic concepts of kernel workqueues, their data structures, how scheduling and execution work, and the workqueues the system creates by default. A workqueue is a mechanism for deferring task execution, built on kernel threads. The article lists the relevant interfaces, analyzes the workqueue, work, and worker data structures, and introduces the workqueues and workers the system creates by default, which greatly simplify usage.

    1. Basic concepts:

    A workqueue is a kernel mechanism for deferring task execution. Unlike softirqs and tasklets, workqueues are built on kernel threads and are therefore allowed to sleep. Let us first go over a few workqueue concepts.

    (1) work: a task to be scheduled and executed.

    (2) worker: the entity that actually executes scheduled tasks.

    (3) workqueue: an object that holds a set of works and workers.

    Next, let us look at the interfaces the kernel provides for workqueues:

    (1) Creating a workqueue: create_workqueue

                Creates a workqueue.

    (2) Creating a worker: create_worker

                Creates a worker that executes work items.

    (3) Scheduling work: schedule_work

                The user initializes a work item, then calls schedule_work to add it to a workqueue and have the workqueue schedule it for execution.

        From the above we can see the essence of workqueues:

        (1) First, a workqueue is created along with its workers, and the workers are attached to the workqueue.

        (2) A work item is created and queued on the workqueue via schedule_work(), and a worker of the workqueue is scheduled to execute it.

        (3) Each worker is a kernel thread, and the work item's handler runs inside that kernel thread (see the minimal sketch below).
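    Before diving into the data structures, here is a minimal sketch of that usage from the client side. The module and the names my_work/my_work_fn are illustrative, not taken from the kernel:

#include <linux/module.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
    /* Runs in a kworker kernel thread, so sleeping is allowed here. */
    pr_info("deferred work executed\n");
}

/* Statically define and initialize a work item bound to my_work_fn. */
static DECLARE_WORK(my_work, my_work_fn);

static int __init wq_demo_init(void)
{
    schedule_work(&my_work);    /* queue it on the default system_wq */
    return 0;
}

static void __exit wq_demo_exit(void)
{
    cancel_work_sync(&my_work); /* ensure it is neither pending nor running */
}

module_init(wq_demo_init);
module_exit(wq_demo_exit);
MODULE_LICENSE("GPL");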

    2. Data structures:

    (1) work_struct:

struct work_struct {
    atomic_long_t data;        // flags plus pool/pwq information, packed into one word
    struct list_head entry;    // list node; nominally queued on the workqueue, actually on a worker_pool's worklist
    work_func_t func;    // the work handler
#ifdef CONFIG_LOCKDEP
    struct lockdep_map lockdep_map;
#endif
};
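    In practice, a work_struct is usually embedded in a larger context object, and the handler recovers that object with container_of. A minimal sketch, assuming a hypothetical struct my_dev:

#include <linux/workqueue.h>

struct my_dev {
    int value;
    struct work_struct work;    /* work item embedded in the device object */
};

static void my_dev_work_fn(struct work_struct *work)
{
    /* Recover the enclosing object from the embedded work_struct. */
    struct my_dev *dev = container_of(work, struct my_dev, work);

    pr_info("my_dev value = %d\n", dev->value);
}

static void my_dev_setup(struct my_dev *dev)
{
    dev->value = 42;
    INIT_WORK(&dev->work, my_dev_work_fn);    /* bind the handler at runtime */
    schedule_work(&dev->work);
}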

    (2) worker:

struct worker {
    /* on idle list while idle, on busy hash table while busy */
    union {
        struct list_head    entry;    /* L: while idle */
        struct hlist_node    hentry;    /* L: while busy */
    };    // union: node on the pool's idle list or busy hash table

    struct work_struct    *current_work;    /* L: work being processed */    // work currently being processed
    work_func_t        current_func;    /* L: current_work's fn */    // its handler
    struct pool_workqueue    *current_pwq; /* L: current_work's pwq */
    struct list_head    scheduled;    /* L: scheduled works */    // works scheduled onto this worker

    /* 64 bytes boundary on 64bit, 32 on 32bit */

    struct task_struct    *task;        /* I: worker task */    // the worker's kernel thread
    struct worker_pool    *pool;        /* A: the associated pool */    // the pool this worker belongs to
                        /* L: for rescuers */
    struct list_head    node;        /* A: anchored at pool->workers */    // node on pool->workers
                        /* A: runs through worker->node */

    unsigned long        last_active;    /* L: last active timestamp */
    unsigned int        flags;        /* X: flags */
    int            id;        /* I: worker id */

    /*
     * Opaque string set with work_set_desc().  Printed out with task
     * dump for debugging - WARN, BUG, panic or sysrq.
     */
    char            desc[WORKER_DESC_LEN];

    /* used only by rescuers to point to the target workqueue */
    struct workqueue_struct    *rescue_wq;    /* I: the workqueue to rescue */
};

    (3) workqueue_struct:

struct workqueue_struct {
    struct list_head    pwqs;        /* WR: all pwqs of this wq */
    struct list_head    list;        /* PR: list of all workqueues */    // node on the global workqueues list

    struct mutex        mutex;        /* protects this wq */
    int            work_color;    /* WQ: current work color */
    int            flush_color;    /* WQ: current flush color */
    atomic_t        nr_pwqs_to_flush; /* flush in progress */
    struct wq_flusher    *first_flusher;    /* WQ: first flusher */
    struct list_head    flusher_queue;    /* WQ: flush waiters */
    struct list_head    flusher_overflow; /* WQ: flush overflow list */

    struct list_head    maydays;    /* MD: pwqs requesting rescue */
    struct worker        *rescuer;    /* I: rescue worker */

    int            nr_drainers;    /* WQ: drain in progress */
    int            saved_max_active; /* WQ: saved pwq max_active */

    struct workqueue_attrs    *unbound_attrs;    /* PW: only for unbound wqs */
    struct pool_workqueue    *dfl_pwq;    /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
    struct wq_device    *wq_dev;    /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
    struct lockdep_map    lockdep_map;
#endif
    char            name[WQ_NAME_LEN]; /* I: workqueue name */

    /*
     * Destruction of workqueue_struct is sched-RCU protected to allow
     * walking the workqueues list without grabbing wq_pool_mutex.
     * This is used to dump all workqueues from sysrq.
     */
    struct rcu_head        rcu;

    /* hot fields used during command issue, aligned to cacheline */
    unsigned int        flags ____cacheline_aligned; /* WQ: WQ_* flags */
    struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
    struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};

    (4) worker_pool:

struct worker_pool {
    spinlock_t        lock;        /* the pool lock */
    int            cpu;        /* I: the associated cpu */
    int            node;        /* I: the associated node ID */
    int            id;        /* I: pool ID */
    unsigned int        flags;        /* X: flags */

    unsigned long        watchdog_ts;    /* L: watchdog timestamp */

    struct list_head    worklist;    /* L: list of pending works */    // all works pending on this pool

    int            nr_workers;    /* L: total number of workers */    // number of workers in this pool
    int            nr_idle;    /* L: currently idle workers */    // number of idle workers in this pool

    struct list_head    idle_list;    /* X: list of idle workers */
    struct timer_list    idle_timer;    /* L: worker idle timeout */
    struct timer_list    mayday_timer;    /* L: SOS timer for workers */

    /* a worker is either on busy_hash or idle_list, or the manager */
    DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
                        /* L: hash of busy workers */

    struct worker        *manager;    /* L: purely informational */
    struct list_head    workers;    /* A: attached workers */    // list of all workers attached to this pool
    struct completion    *detach_completion; /* all workers detached */    // completion used when tearing workers down

    struct ida        worker_ida;    /* worker IDs for task name */

    struct workqueue_attrs    *attrs;        /* I: worker attributes */
    struct hlist_node    hash_node;    /* PL: unbound_pool_hash node */
    int            refcnt;        /* PL: refcnt for unbound pools */

    /*
     * The current concurrency level.  As it's likely to be accessed
     * from other CPUs during try_to_wake_up(), put it in a separate
     * cacheline.
     */
    atomic_t        nr_running ____cacheline_aligned_in_smp;

    /*
     * Destruction of pool is sched-RCU protected to allow dereferences
     * from get_work_pool().
     */
    struct rcu_head        rcu;
} ____cacheline_aligned_in_smp;
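    One structure that appears throughout the code but is not listed above is pool_workqueue (pwq). It is the glue between a workqueue_struct and a worker_pool, carrying the per-(workqueue, pool) state that shows up later in the code, such as max_active, nr_active and the delayed_works list. A bound workqueue has one pwq per CPU (cpu_pwqs), while an unbound workqueue looks its pwqs up per NUMA node (numa_pwq_tbl). A work queued on a workqueue therefore ends up on the worklist of the worker_pool behind the selected pwq, where that pool's workers pick it up.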

    The data structures and code carry a lot of detail, but we can set those details aside, find the core fields and interfaces we care about, and come back to the rest once the core idea is understood.

    3. Interface implementation:

    (1) Creating a workqueue:

#define create_workqueue(name)                        \
    alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))

#define alloc_workqueue(fmt, flags, max_active, args...)        \
    __alloc_workqueue_key((fmt), (flags), (max_active),        \
                  NULL, NULL, ##args)

struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                           unsigned int flags,
                           int max_active,
                           struct lock_class_key *key,
                           const char *lock_name, ...)
{
    size_t tbl_size = 0;
    va_list args;
    struct workqueue_struct *wq;
    struct pool_workqueue *pwq;

    /*
     * Unbound && max_active == 1 used to imply ordered, which is no
     * longer the case on NUMA machines due to per-node pools.  While
     * alloc_ordered_workqueue() is the right way to create an ordered
     * workqueue, keep the previous behavior to avoid subtle breakages
     * on NUMA.
     */
    if ((flags & WQ_UNBOUND) && max_active == 1)
        flags |= __WQ_ORDERED;

    /* see the comment above the definition of WQ_POWER_EFFICIENT */
    if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
        flags |= WQ_UNBOUND;

    /* allocate wq and format name */
    if (flags & WQ_UNBOUND)
        tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);

    wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);    // allocate the workqueue_struct instance
    if (!wq)
        return NULL;

    if (flags & WQ_UNBOUND) {
        wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
        if (!wq->unbound_attrs)
            goto err_free_wq;
    }

    va_start(args, lock_name);
    vsnprintf(wq->name, sizeof(wq->name), fmt, args);    // format the workqueue's name
    va_end(args);

    max_active = max_active ?: WQ_DFL_ACTIVE;
    max_active = wq_clamp_max_active(max_active, flags, wq->name);

    /* init wq */
    wq->flags = flags;
    wq->saved_max_active = max_active;
    mutex_init(&wq->mutex);
    atomic_set(&wq->nr_pwqs_to_flush, 0);
    INIT_LIST_HEAD(&wq->pwqs);
    INIT_LIST_HEAD(&wq->flusher_queue);
    INIT_LIST_HEAD(&wq->flusher_overflow);
    INIT_LIST_HEAD(&wq->maydays);

    lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
    INIT_LIST_HEAD(&wq->list);

    if (alloc_and_link_pwqs(wq) < 0)
        goto err_free_wq;

    if (wq_online && init_rescuer(wq) < 0)
        goto err_destroy;

    if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
        goto err_destroy;

    /*
     * wq_pool_mutex protects global freeze state and workqueues list.
     * Grab it, adjust max_active and add the new @wq to workqueues
     * list.
     */
    mutex_lock(&wq_pool_mutex);

    mutex_lock(&wq->mutex);
    for_each_pwq(pwq, wq)
        pwq_adjust_max_active(pwq);
    mutex_unlock(&wq->mutex);

    list_add_tail_rcu(&wq->list, &workqueues);    // add the workqueue to the global workqueues list

    mutex_unlock(&wq_pool_mutex);

    return wq;

err_free_wq:
    free_workqueue_attrs(wq->unbound_attrs);
    kfree(wq);
    return NULL;
err_destroy:
    destroy_workqueue(wq);
    return NULL;
}
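    For users who want a dedicated workqueue instead of the system default, the usual pairing is alloc_workqueue at load time and destroy_workqueue at unload, with queue_work targeting the private queue. A minimal sketch, with illustrative names and flags:

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;

static void my_wq_work_fn(struct work_struct *work)
{
    pr_info("running on a dedicated workqueue\n");
}
static DECLARE_WORK(my_wq_work, my_wq_work_fn);

static int __init mywq_demo_init(void)
{
    /* Unbound queue with max_active = 1, i.e. works run one at a time. */
    my_wq = alloc_workqueue("my_wq", WQ_UNBOUND, 1);
    if (!my_wq)
        return -ENOMEM;

    queue_work(my_wq, &my_wq_work);    /* target our queue, not system_wq */
    return 0;
}

static void __exit mywq_demo_exit(void)
{
    flush_workqueue(my_wq);            /* wait for everything queued so far */
    destroy_workqueue(my_wq);
}

module_init(mywq_demo_init);
module_exit(mywq_demo_exit);
MODULE_LICENSE("GPL");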

    (2) Creating a worker:

static struct worker *create_worker(struct worker_pool *pool)
{
    struct worker *worker = NULL;
    int id = -1;
    char id_buf[16];

    /* ID is needed to determine kthread name */
    id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);    // allocate a worker id within this pool
    if (id < 0)
        goto fail;

    worker = alloc_worker(pool->node);    // allocate the worker instance
    if (!worker)
        goto fail;

    worker->id = id;    // record the worker id

    if (pool->cpu >= 0)
        snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
             pool->attrs->nice < 0  ? "H" : "");
    else
        snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);

    worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                          "kworker/%s", id_buf);    // create the worker's kernel thread

    if (IS_ERR(worker->task))
        goto fail;

    set_user_nice(worker->task, pool->attrs->nice);
    kthread_bind_mask(worker->task, pool->attrs->cpumask);

    /* successful, attach the worker to the pool */
    worker_attach_to_pool(worker, pool);    // attach the worker to the pool's workers list

    /* start the newly created worker */
    spin_lock_irq(&pool->lock);
    worker->pool->nr_workers++;    // update the pool's worker count
    worker_enter_idle(worker);    // put the worker on the idle list
    wake_up_process(worker->task);    // wake the worker thread, which runs worker_thread
    spin_unlock_irq(&pool->lock);

    return worker;

fail:
    if (id >= 0)
        ida_simple_remove(&pool->worker_ida, id);
    kfree(worker);
    return NULL;
}
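    A side note on naming: the id_buf formats above are where the familiar kworker thread names come from. A worker of a per-CPU pool shows up in ps as kworker/<cpu>:<id> (with an H suffix for the high-priority pool), while a worker of an unbound pool shows up as kworker/u<pool-id>:<id>.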

    (3) Scheduling work:

static inline bool schedule_work(struct work_struct *work)
{
    /*
     * By default the work goes onto system_wq, a workqueue the system
     * creates automatically at boot.  To schedule onto a user-created
     * workqueue, call queue_work() directly instead.
     */
    return queue_work(system_wq, work);
}

static inline bool queue_work(struct workqueue_struct *wq,
                  struct work_struct *work)
{
    return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}

bool queue_work_on(int cpu, struct workqueue_struct *wq,
           struct work_struct *work)
{
    bool ret = false;
    unsigned long flags;

    local_irq_save(flags);

    if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
        __queue_work(cpu, wq, work);
        ret = true;
    }

    local_irq_restore(flags);
    return ret;
}
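    queue_work_on can also be called directly when a work item must execute on one particular CPU; the WORK_CPU_UNBOUND used above simply lets the workqueue pick a CPU itself. A minimal sketch (the CPU number is illustrative and must be online):

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/smp.h>

static void pinned_fn(struct work_struct *work)
{
    /* Per-CPU workers are bound to one CPU, so this is stable here. */
    pr_info("executing on CPU %d\n", smp_processor_id());
}
static DECLARE_WORK(pinned_work, pinned_fn);

static int __init pin_demo_init(void)
{
    /* Queue on system_wq but force execution on CPU 2. */
    queue_work_on(2, system_wq, &pinned_work);
    return 0;
}
module_init(pin_demo_init);
MODULE_LICENSE("GPL");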

static void __queue_work(int cpu, struct workqueue_struct *wq,
             struct work_struct *work)

{
    struct pool_workqueue *pwq;
    struct worker_pool *last_pool;
    struct list_head *worklist;
    unsigned int work_flags;
    unsigned int req_cpu = cpu;

    /*
     * While a work item is PENDING && off queue, a task trying to
     * steal the PENDING will busy-loop waiting for it to either get
     * queued or lose PENDING.  Grabbing PENDING and queueing should
     * happen with IRQ disabled.
     */
    lockdep_assert_irqs_disabled();

    debug_work_activate(work);

    /* if draining, only works from the same workqueue are allowed */
    if (unlikely(wq->flags & __WQ_DRAINING) &&
        WARN_ON_ONCE(!is_chained_work(wq)))
        return;
retry:
    if (req_cpu == WORK_CPU_UNBOUND)
        cpu = wq_select_unbound_cpu(raw_smp_processor_id());

    /* pwq which will be used unless @work is executing elsewhere */
    if (!(wq->flags & WQ_UNBOUND))
        pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
    else
        pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));

    /*
     * If @work was previously on a different pool, it might still be
     * running there, in which case the work needs to be queued on that
     * pool to guarantee non-reentrancy.
     */
    last_pool = get_work_pool(work);    // the pool this work last ran on
    if (last_pool && last_pool != pwq->pool) {
        struct worker *worker;

        spin_lock(&last_pool->lock);

        worker = find_worker_executing_work(last_pool, work);    // is a worker there still executing this work?

        if (worker && worker->current_pwq->wq == wq) {
            pwq = worker->current_pwq;
        } else {
            /* meh... not running there, queue here */
            spin_unlock(&last_pool->lock);
            spin_lock(&pwq->pool->lock);
        }
    } else {
        spin_lock(&pwq->pool->lock);
    }

    /*
     * pwq is determined and locked.  For unbound pools, we could have
     * raced with pwq release and it could already be dead.  If its
     * refcnt is zero, repeat pwq selection.  Note that pwqs never die
     * without another pwq replacing it in the numa_pwq_tbl or while
     * work items are executing on it, so the retrying is guaranteed to
     * make forward-progress.
     */
    if (unlikely(!pwq->refcnt)) {
        if (wq->flags & WQ_UNBOUND) {
            spin_unlock(&pwq->pool->lock);
            cpu_relax();
            goto retry;
        }
        /* oops */
        WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
              wq->name, cpu);
    }

    /* pwq determined, queue */
    trace_workqueue_queue_work(req_cpu, pwq, work);

    if (WARN_ON(!list_empty(&work->entry))) {
        spin_unlock(&pwq->pool->lock);
        return;
    }

    pwq->nr_in_flight[pwq->work_color]++;
    work_flags = work_color_to_flags(pwq->work_color);

    if (likely(pwq->nr_active < pwq->max_active)) {
        trace_workqueue_activate_work(work);
        pwq->nr_active++;
        worklist = &pwq->pool->worklist;
        if (list_empty(worklist))
            pwq->pool->watchdog_ts = jiffies;
    } else {
        work_flags |= WORK_STRUCT_DELAYED;
        worklist = &pwq->delayed_works;
    }

    insert_work(pwq, work, worklist, work_flags);    // insert the work onto the chosen list (pool worklist or delayed_works)

    spin_unlock(&pwq->pool->lock);
}

static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
            struct list_head *head, unsigned int extra_flags)
{
    struct worker_pool *pool = pwq->pool;

    /* we own @work, set data and link */
    set_work_pwq(work, pwq, extra_flags);
    list_add_tail(&work->entry, head);    // link the work onto the target list
    get_pwq(pwq);

    /*
     * Ensure either wq_worker_sleeping() sees the above
     * list_add_tail() or we see zero nr_running to avoid workers lying
     * around lazily while there are works to be processed.
     */
    smp_mb();

    if (__need_more_worker(pool))
        wake_up_worker(pool);    // wake an idle worker in the pool
}

static void wake_up_worker(struct worker_pool *pool)
{
    struct worker *worker = first_idle_worker(pool);

    if (likely(worker))
        wake_up_process(worker->task);    // wake the worker's kernel thread, which runs worker_thread
}

    Now let us look at how the worker_thread function ultimately executes:

static int worker_thread(void *__worker)
{
    struct worker *worker = __worker;
    struct worker_pool *pool = worker->pool;    // the pool this worker belongs to

    /* tell the scheduler that this is a workqueue worker */
    set_pf_worker(true);
woke_up:
    spin_lock_irq(&pool->lock);

    /* am I supposed to die? */
    if (unlikely(worker->flags & WORKER_DIE)) {
        spin_unlock_irq(&pool->lock);
        WARN_ON_ONCE(!list_empty(&worker->entry));
        set_pf_worker(false);

        set_task_comm(worker->task, "kworker/dying");
        ida_simple_remove(&pool->worker_ida, worker->id);
        worker_detach_from_pool(worker);
        kfree(worker);
        return 0;
    }

    worker_leave_idle(worker);
recheck:
    /* no more worker necessary? */
    if (!need_more_worker(pool))
        goto sleep;

    /* do we need to manage? */
    if (unlikely(!may_start_working(pool)) && manage_workers(worker))
        goto recheck;

    /*
     * ->scheduled list can only be filled while a worker is
     * preparing to process a work or actually processing it.
     * Make sure nobody diddled with it while I was sleeping.
     */
    WARN_ON_ONCE(!list_empty(&worker->scheduled));

    /*
     * Finish PREP stage.  We're guaranteed to have at least one idle
     * worker or that someone else has already assumed the manager
     * role.  This is where @worker starts participating in concurrency
     * management if applicable and concurrency management is restored
     * after being rebound.  See rebind_workers() for details.
     */
    worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

    do {    // loop over all works on the pool's worklist
        struct work_struct *work =
            list_first_entry(&pool->worklist,
                     struct work_struct, entry);    // take the first work

        pool->watchdog_ts = jiffies;

        if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
            /* optimization path, not strictly necessary */
            process_one_work(worker, work);    // actually run the user-registered handler
            if (unlikely(!list_empty(&worker->scheduled)))
                process_scheduled_works(worker);
        } else {
            move_linked_works(work, &worker->scheduled, NULL);
            process_scheduled_works(worker);
        }
    } while (keep_working(pool));

    worker_set_flags(worker, WORKER_PREP);
sleep:
    /*
     * pool->lock is held and there's no work to process and no need to
     * manage, sleep.  Workers are woken up only while holding
     * pool->lock or from local cpu, so setting the current state
     * before releasing pool->lock is enough to prevent losing any
     * event.
     */
    worker_enter_idle(worker);
    __set_current_state(TASK_IDLE);
    spin_unlock_irq(&pool->lock);
    schedule();
    goto woke_up;
}

static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
    struct pool_workqueue *pwq = get_work_pwq(work);
    struct worker_pool *pool = worker->pool;
    bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
    int work_color;
    struct worker *collision;
#ifdef CONFIG_LOCKDEP
    /*
     * It is permissible to free the struct work_struct from
     * inside the function that is called from it, this we need to
     * take into account for lockdep too.  To avoid bogus "held
     * lock freed" warnings as well as problems when looking into
     * work->lockdep_map, make a copy and use that here.
     */
    struct lockdep_map lockdep_map;

    lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
    /* ensure we're on the correct CPU */
    WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
             raw_smp_processor_id() != pool->cpu);

    /*
     * A single work shouldn't be executed concurrently by
     * multiple workers on a single cpu.  Check whether anyone is
     * already processing the work.  If so, defer the work to the
     * currently executing one.
     */
    collision = find_worker_executing_work(pool, work);
    if (unlikely(collision)) {
        move_linked_works(work, &collision->scheduled, NULL);
        return;
    }

    /* claim and dequeue */
    debug_work_deactivate(work);
    hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
    worker->current_work = work;
    worker->current_func = work->func;
    worker->current_pwq = pwq;
    work_color = get_work_color(work);    // set up the worker's current work state

    /*
     * Record wq name for cmdline and debug reporting, may get
     * overridden through set_worker_desc().
     */
    strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

    list_del_init(&work->entry);

    /*
     * CPU intensive works don't participate in concurrency management.
     * They're the scheduler's responsibility.  This takes @worker out
     * of concurrency management and the next code block will chain
     * execution of the pending work items.
     */
    if (unlikely(cpu_intensive))
        worker_set_flags(worker, WORKER_CPU_INTENSIVE);

    /*
     * Wake up another worker if necessary.  The condition is always
     * false for normal per-cpu workers since nr_running would always
     * be >= 1 at this point.  This is used to chain execution of the
     * pending work items for WORKER_NOT_RUNNING workers such as the
     * UNBOUND and CPU_INTENSIVE ones.
     */
    if (need_more_worker(pool))
        wake_up_worker(pool);

    /*
     * Record the last pool and clear PENDING which should be the last
     * update to @work.  Also, do this inside @pool->lock so that
     * PENDING and queued state changes happen together while IRQ is
     * disabled.
     */
    set_work_pool_and_clear_pending(work, pool->id);

    spin_unlock_irq(&pool->lock);

    lock_map_acquire(&pwq->wq->lockdep_map);
    lock_map_acquire(&lockdep_map);
    /*
     * Strictly speaking we should mark the invariant state without holding
     * any locks, that is, before these two lock_map_acquire()'s.
     *
     * However, that would result in:
     *
     *   A(W1)
     *   WFC(C)
     *        A(W1)
     *        C(C)
     *
     * Which would create W1->C->W1 dependencies, even though there is no
     * actual deadlock possible. There are two solutions, using a
     * read-recursive acquire on the work(queue) 'locks', but this will then
     * hit the lockdep limitation on recursive locks, or simply discard
     * these locks.
     *
     * AFAICT there is no possible deadlock scenario between the
     * flush_work() and complete() primitives (except for single-threaded
     * workqueues), so hiding them isn't a problem.
     */
    lockdep_invariant_state(true);
    trace_workqueue_execute_start(work);
    worker->current_func(work);    // finally invoke the user-registered handler
    /*
     * While we must be careful to not use "work" after this, the trace
     * point will only record its address.
     */
    trace_workqueue_execute_end(work);
    lock_map_release(&lockdep_map);
    lock_map_release(&pwq->wq->lockdep_map);

    if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
        pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
               "     last function: %pf\n",
               current->comm, preempt_count(), task_pid_nr(current),
               worker->current_func);
        debug_show_held_locks(current);
        dump_stack();
    }

    /*
     * The following prevents a kworker from hogging CPU on !PREEMPT
     * kernels, where a requeueing work item waiting for something to
     * happen could deadlock with stop_machine as such work item could
     * indefinitely requeue itself while all other CPUs are trapped in
     * stop_machine. At the same time, report a quiescent RCU state so
     * the same condition doesn't freeze RCU.
     */
    cond_resched();

    spin_lock_irq(&pool->lock);

    /* clear cpu intensive status */
    if (unlikely(cpu_intensive))
        worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

    /* we're done with it, release */
    hash_del(&worker->hentry);
    worker->current_work = NULL;
    worker->current_func = NULL;
    worker->current_pwq = NULL;
    pwq_dec_nr_in_flight(pwq, work_color);
}
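    Note that process_one_work guarantees a work item is never run concurrently by multiple workers on the same pool (the collision check above), but it does not synchronize with code that frees the work item. Callers tearing down a work item should first call flush_work() or cancel_work_sync(). A minimal sketch of the cleanup path, reusing the hypothetical struct my_dev from the earlier sketch:

#include <linux/slab.h>

static void my_dev_teardown(struct my_dev *dev)
{
    /*
     * Remove the work if it is still pending; if the handler is already
     * running, wait for it to finish.  Only after that is it safe to
     * free the object that embeds the work_struct.
     */
    cancel_work_sync(&dev->work);
    kfree(dev);
}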

    At this point we have analyzed the overall workqueue flow from a high level. One more thing is worth knowing: during initialization the system creates several default workqueues for us, so we usually do not have to create a workqueue and its workers ourselves. To use them, we simply call schedule_work (which queues on system_wq by default) or queue_work (naming a workqueue that already exists in the system). This greatly simplifies workqueue usage and makes client code more convenient to write.
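    As a rule of thumb, going by the descriptions in include/linux/workqueue.h: system_wq is meant for short works, system_long_wq for works that may take noticeably longer, system_unbound_wq for works that need not be bound to the submitting CPU and may run for a long time, system_highpri_wq for works that require high priority, and the freezable/power-efficient variants for suspend-aware and power-conscious use respectively.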

    (1) Workqueues created by default by the system:

int __init workqueue_init_early(void)
{
    int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
    int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
    int i, cpu;

    WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

    BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
    cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));

    pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

    /* initialize CPU pools */
    for_each_possible_cpu(cpu) {
        struct worker_pool *pool;

        i = 0;
        for_each_cpu_worker_pool(pool, cpu) {
            BUG_ON(init_worker_pool(pool));
            pool->cpu = cpu;
            cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
            pool->attrs->nice = std_nice[i++];
            pool->node = cpu_to_node(cpu);

            /* alloc pool ID */
            mutex_lock(&wq_pool_mutex);
            BUG_ON(worker_pool_assign_id(pool));
            mutex_unlock(&wq_pool_mutex);
        }
    }

    /* create default unbound and ordered wq attrs */
    for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
        struct workqueue_attrs *attrs;

        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        unbound_std_wq_attrs[i] = attrs;

        /*
         * An ordered wq should have only one pwq as ordering is
         * guaranteed by max_active which is enforced by pwqs.
         * Turn off NUMA so that dfl_pwq is used for all nodes.
         */
        BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
        attrs->nice = std_nice[i];
        attrs->no_numa = true;
        ordered_wq_attrs[i] = attrs;
    }

    // the workqueues the system creates by default

    system_wq = alloc_workqueue("events", 0, 0);
    system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
    system_long_wq = alloc_workqueue("events_long", 0, 0);
    system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                        WQ_UNBOUND_MAX_ACTIVE);
    system_freezable_wq = alloc_workqueue("events_freezable",
                          WQ_FREEZABLE, 0);
    system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                          WQ_POWER_EFFICIENT, 0);
    system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                          WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                          0);

    BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
           !system_unbound_wq || !system_freezable_wq ||
           !system_power_efficient_wq ||
           !system_freezable_power_efficient_wq);

    return 0;
}

    (2) Workers created by default by the system:

int __init workqueue_init(void)
{
    struct workqueue_struct *wq;
    struct worker_pool *pool;
    int cpu, bkt;

    /*
     * It'd be simpler to initialize NUMA in workqueue_init_early() but
     * CPU to node mapping may not be available that early on some
     * archs such as power and arm64.  As per-cpu pools created
     * previously could be missing node hint and unbound pools NUMA
     * affinity, fix them up.
     *
     * Also, while iterating workqueues, create rescuers if requested.
     */
    wq_numa_init();

    mutex_lock(&wq_pool_mutex);

    for_each_possible_cpu(cpu) {
        for_each_cpu_worker_pool(pool, cpu) {
            pool->node = cpu_to_node(cpu);
        }
    }

    list_for_each_entry(wq, &workqueues, list) {
        wq_update_unbound_numa(wq, smp_processor_id(), true);
        WARN(init_rescuer(wq),
             "workqueue: failed to create early rescuer for %s",
             wq->name);
    }

    mutex_unlock(&wq_pool_mutex);

    /* create the initial workers */
    for_each_online_cpu(cpu) {
        for_each_cpu_worker_pool(pool, cpu) {
            pool->flags &= ~POOL_DISASSOCIATED;
            BUG_ON(!create_worker(pool));    // create the initial worker for each per-CPU pool
        }
    }

    hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
        BUG_ON(!create_worker(pool));    // create the initial worker for each unbound pool

    wq_online = true;
    wq_watchdog_init();

    return 0;
}

Reposted from: https://my.oschina.net/yepanl/blog/3051288
