Linux中断子系统12

Linux中断子系统12(基于Linux6.6)---中断之workqueue3

 


一、前情回顾

之前文章介绍可以用下面几个函数来创建workqueue。

include/linux/workqueue.h

/* Ordered wq: unbound, at most one work item in flight, executed in queueing order. */
#define alloc_ordered_workqueue(fmt, flags, args...)			\
	alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED |		\
			__WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

/* Legacy convenience wrappers kept for old callers; all funnel into alloc_workqueue(). */
#define create_workqueue(name)						\
	alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
#define create_freezable_workqueue(name)				\
	alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |	\
			WQ_MEM_RECLAIM, 1, (name))
#define create_singlethread_workqueue(name)				\
	alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)

仔细分析上面的函数可以看到,它们最终调用的都是alloc_workqueue,它的声明如下:

__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
					 unsigned int flags,
					 int max_active, ...)

主要以alloc_workqueue函数为主线,描述CMWQ中的创建一个workqueue实例的代码过程。

二、工作、工作队列、工作线程池、工作线程数据结构

workqueue机制最小的调度单元是work_struct ,即工作任务。

include/linux/workqueue.h

 
struct work_struct {
	atomic_long_t data;            //low bits hold the WORK_STRUCT_* flags; the remaining bits store either the ID of the worker_pool that last ran this work or a pool_workqueue pointer, selected by the WORK_STRUCT_PWQ flag
	struct list_head entry;        //list node used to queue this work on a workqueue
	work_func_t func;              //callback invoked to process this work item
#ifdef CONFIG_LOCKDEP
	struct lockdep_map lockdep_map;
#endif
};

工作队列由struct workqueue_struct数据结构描述:kernel/workqueue.c

  
/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq -- list of every pool_workqueue backing this workqueue */
	struct list_head	list;		/* PR: list of all workqueues -- node on the global list of all workqueue_structs */
 
	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */
 
	struct list_head	maydays;	/* MD: pwqs requesting rescue -- pool_workqueues currently in rescue state */
	struct worker		*rescuer;	/* I: rescue worker -- creating a new worker may fail under memory pressure; if the wq was created with WQ_MEM_RECLAIM, this rescuer thread takes over in that case */
 
	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */
 
	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs -- attributes of WQ_UNBOUND workqueues */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs -- default pool_workqueue of an unbound wq */
 
#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */
 
	/*
	 * Destruction of workqueue_struct is sched-RCU protected to allow
	 * walking the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;
 
	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags -- touched by many CPUs, hence cacheline aligned */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs -- points to the per-cpu pool_workqueues */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node -- per-NUMA-node pool_workqueues */
};

运行work_struct的内核线程被称为worker,即工作线程。

kernel/workqueue_internal.h

  
/*
 * The poor guys doing the actual heavy lifting.  All on-duty workers are
 * either serving the manager role, on idle list or on busy hash.  For
 * details on the locking annotation (L, I, X...), refer to workqueue.c.
 *
 * Only to be used in workqueue and async.
 */
struct worker {
	/* on idle list while idle, on busy hash table while busy */
	union {
		struct list_head	entry;	/* L: while idle */
		struct hlist_node	hentry;	/* L: while busy */
	};
 
	struct work_struct	*current_work;	/* L: work being processed -- the work item this worker is executing right now */
	work_func_t		current_func;	/* L: current_work's fn -- callback currently being executed */
	struct pool_workqueue	*current_pwq; /* L: current_work's pwq -- pool_workqueue the current work belongs to */
	struct list_head	scheduled;	/* L: scheduled works -- work_structs that have been scheduled and are about to be executed are linked here */
 
	/* 64 bytes boundary on 64bit, 32 on 32bit */
 
	struct task_struct	*task;		/* I: worker task -- task_struct of this worker thread */
	struct worker_pool	*pool;		/* A: the associated pool -- worker_pool this worker belongs to */
						/* L: for rescuers */
	struct list_head	node;		/* A: anchored at pool->workers -- links this worker into worker_pool->workers */
						/* A: runs through worker->node */
 
	unsigned long		last_active;	/* L: last active timestamp */
	unsigned int		flags;		/* X: flags */
	int			id;		/* I: worker id */
 
	/*
	 * Opaque string set with work_set_desc().  Printed out with task
	 * dump for debugging - WARN, BUG, panic or sysrq.
	 */
	char			desc[WORKER_DESC_LEN];
 
	/* used only by rescuers to point to the target workqueue */
	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */
};

CMWQ提出了工作线程池的概念,struct worker_pool数据结构用于描述工作线程池。

worker_pool是per-cpu变量,每个CPU都有worker_pool,而且有两个worker_pool。

一个用于普通优先级工作线程,另一个用于高优先级工作线程。

kernel/workqueue.c


struct worker_pool {
	spinlock_t		lock;		/* the pool lock -- protects this worker_pool */
	int			cpu;		/* I: the associated cpu -- -1 for unbound pools; the bound CPU id for per-cpu pools */
	int			node;		/* I: the associated node ID */
	int			id;		/* I: pool ID of this worker_pool */
	unsigned int		flags;		/* X: flags */
 
	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
 
	struct list_head	worklist;	/* L: list of pending works -- work_structs queued here await execution */
 
	int			nr_workers;	/* L: total number of workers in this pool */
	int			nr_idle;	/* L: currently idle workers */
 
	struct list_head	idle_list;	/* X: list of idle workers */
	struct timer_list	idle_timer;	/* L: worker idle timeout */
	struct timer_list	mayday_timer;	/* L: SOS timer for workers */
 
	/* a workers is either on busy_hash or idle_list, or the manager */
	DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
						/* L: hash of busy workers */
 
	struct worker		*manager;	/* L: purely informational */
	struct list_head	workers;	/* A: attached workers -- all worker threads managed by this pool */
	struct completion	*detach_completion; /* all workers detached */
 
	struct ida		worker_ida;	/* worker IDs for task name */
 
	struct workqueue_attrs	*attrs;		/* I: worker attributes of this pool */
	struct hlist_node	hash_node;	/* PL: unbound_pool_hash node */
	int			refcnt;		/* PL: refcnt for unbound pools */
 
	/*
	 * The current concurrency level.  As it's likely to be accessed
	 * from other CPUs during try_to_wake_up(), put it in a separate
	 * cacheline.
	 * Bookkeeping for worker creation/destruction: the number of running
	 * workers.  Accessed concurrently from multiple CPUs, so it gets a
	 * cacheline of its own.
	 */
	atomic_t		nr_running ____cacheline_aligned_in_smp;
 
	/*
	 * Destruction of pool is sched-RCU protected to allow dereferences
	 * from get_work_pool().
	 */
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;

struct pool_workqueue用于链接workqueue和worker_pool。

kernel/workqueue.c 

  
/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
 * point to the pwq; thus, pwqs need to be aligned at two's power of the
 * number of flag bits.
 */
struct pool_workqueue {
	struct worker_pool	*pool;		/* I: the associated pool -- worker_pool backing this pwq */
	struct workqueue_struct *wq;		/* I: the owning workqueue */
	int			work_color;	/* L: current color */
	int			flush_color;	/* L: flushing color */
	int			refcnt;		/* L: reference count */
	int			nr_in_flight[WORK_NR_COLORS];
						/* L: nr of in_flight works */
	int			nr_active;	/* L: nr of active works -- number of currently active work_structs */
	int			max_active;	/* L: max active works -- upper bound on active work_structs */
	struct list_head	delayed_works;	/* L: delayed works -- list of work_structs whose execution is deferred */
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */
 
	/*
	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
	 * itself is also sched-RCU protected so that the first pwq can be
	 * determined without grabbing wq->mutex.
	 */
	struct work_struct	unbound_release_work;
	struct rcu_head		rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);

三、代码分析

首先列出这个函数的代码。kernel/workqueue.c

__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
					 unsigned int flags,
					 int max_active, ...)
{
	size_t tbl_size = 0;	/* fix: was used below without ever being declared */
	va_list args;
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;
 
	/*
	 * Unbound && max_active == 1 used to imply ordered, which is no
	 * longer the case on NUMA machines due to per-node pools.  While
	 * alloc_ordered_workqueue() is the right way to create an ordered
	 * workqueue, keep the previous behavior to avoid subtle breakages
	 * on NUMA.
	 */
	if ((flags & WQ_UNBOUND) && max_active == 1)        /* see analysis 1 below */
		flags |= __WQ_ORDERED;
 
	/* see the comment above the definition of WQ_POWER_EFFICIENT */
	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)    /* see analysis 2 below */
		flags |= WQ_UNBOUND;
 
	/* allocate wq and format name */
	if (flags & WQ_UNBOUND)                /* see analysis 3 below */
		tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);    /* size of the per-node numa_pwq_tbl[] tail array */
 
	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);        /* allocate the workqueue_struct */
	if (!wq)
		return NULL;
 
	if (flags & WQ_UNBOUND) {
		wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);    /* unbound wqs carry their own attributes */
		if (!wq->unbound_attrs)
			goto err_free_wq;
	}
 
	/* analysis 4: format the workqueue name from fmt and the variadic args */
	va_start(args, max_active);	/* fix: was va_start(args, lock_name) -- lock_name is not a parameter of alloc_workqueue() */
	vsnprintf(wq->name, sizeof(wq->name), fmt, args);
	va_end(args);
 
	max_active = max_active ?: WQ_DFL_ACTIVE;
	max_active = wq_clamp_max_active(max_active, flags, wq->name);
 
	/* init wq */
	wq->flags = flags;
	wq->saved_max_active = max_active;
	mutex_init(&wq->mutex);
	atomic_set(&wq->nr_pwqs_to_flush, 0);
	INIT_LIST_HEAD(&wq->pwqs);
	INIT_LIST_HEAD(&wq->flusher_queue);
	INIT_LIST_HEAD(&wq->flusher_overflow);
	INIT_LIST_HEAD(&wq->maydays);
 
	wq_init_lockdep(wq);	/* fix: lockdep_init_map(&wq->lockdep_map, lock_name, key, 0) referenced undeclared lock_name/key (leftovers from the old __alloc_workqueue_key() signature) */
	INIT_LIST_HEAD(&wq->list);
 
 
	/* analysis 5: allocate the pwqs and link them to the worker pools */
	if (alloc_and_link_pwqs(wq) < 0)
		goto err_free_wq;
 
	if (wq_online && init_rescuer(wq) < 0)
		goto err_destroy;
 
	if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
		goto err_destroy;
 
	/*
	 * wq_pool_mutex protects global freeze state and workqueues list.
	 * Grab it, adjust max_active and add the new @wq to workqueues
	 * list.
	 */
	mutex_lock(&wq_pool_mutex);
 
	mutex_lock(&wq->mutex);
	for_each_pwq(pwq, wq)
		pwq_adjust_max_active(pwq);
	mutex_unlock(&wq->mutex);
 
	list_add_tail_rcu(&wq->list, &workqueues);
 
	mutex_unlock(&wq_pool_mutex);
 
	return wq;
err_free_wq:
	free_workqueue_attrs(wq->unbound_attrs);
	kfree(wq);
	return NULL;
err_destroy:
	destroy_workqueue(wq);
	return NULL;
}
EXPORT_SYMBOL_GPL(alloc_workqueue);

分析1:

if ((flags & WQ_UNBOUND) && max_active == 1)
flags |= __WQ_ORDERED;

对于最大worker为1且没绑定具体cpu的workqueue,系统也是默认整个workqueue是有序执行的。

虽然创建有序工作队列的正确方式是使用下面这个宏,但不能保证使用者不会绕过它直接调用alloc_workqueue,所以这里还要再判断一次。另外,在NUMA机器上,"unbound且max_active==1"曾经隐含有序语义,引入per-node线程池之后这一点不再自动成立,因此要强制补上__WQ_ORDERED标志以保持兼容。

 #define alloc_ordered_workqueue(fmt, flags, args...)			\
	alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED |		\
			__WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

分析2:

 	/* see the comment above the definition of WQ_POWER_EFFICIENT */
	if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
		flags |= WQ_UNBOUND;
 

在kernel中,有两种线程池,一种是线程池是per cpu的,也就是说,系统中有多少个cpu,就会创建多少个线程池,cpu x上的线程池创建的worker线程也只会运行在cpu x上另外一种是unbound thread pool,该线程池创建的worker线程可以调度到任意的cpu上去

由于cache locality的原因,per cpu的线程池的性能会好一些,但是对power saving有一些影响。设计往往如此,workqueue需要在performance和power saving之间平衡,想要更好的性能,那么最好让一个cpu上的worker thread来处理work,这样的话,cache命中率会比较高,性能会更好。但是,从电源管理的角度来看,最好的策略是让idle状态的cpu尽可能的保持idle,而不是反复idle,working,idle again。

举例:

在t1时刻,work被调度到CPU A上执行,t2时刻work执行完毕,CPU A进入idle,t3时刻有一个新的work需要处理,这时候调度work到那个CPU会好些呢?是处于working状态的CPU B还是处于idle状态的CPU A呢?如果调度到CPU A上运行,那么,由于之前处理过work,其cache内容新鲜热辣,处理起work当然是得心应手,速度很快,但是,这需要将CPU A从idle状态中唤醒。选择CPU B呢就不存在将CPU 从idle状态唤醒,从而获取power saving方面的好处。

3.1、workqueue

两个参数可以控制workqueue在performance和power saving之间的平衡:

1、各个workqueue需要通过WQ_POWER_EFFICIENT来标记自己在功耗方面的属性

2、系统级别的内核参数workqueue.power_efficient。

使用workqueue的用户知道自己在电源管理方面的特点,如果该workqueue在unbound的时候会极大的降低功耗,那么就需要加上WQ_POWER_EFFICIENT的标记。这时候,如果没有标记WQ_UNBOUND,那么缺省workqueue会创建per cpu thread pool来处理work。不过,也可以通过workqueue.power_efficient这个内核参数来修改workqueue的行为:

 /* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);

如果wq_power_efficient设定为true,那么WQ_POWER_EFFICIENT的标记的workqueue就会强制按照unbound workqueue来处理,即使没有标记WQ_UNBOUND

分析3:

 	/* allocate wq and format name */
	if (flags & WQ_UNBOUND)
		tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
 
	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
	if (!wq)
		return NULL;
 
	if (flags & WQ_UNBOUND) {
		wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
		if (!wq->unbound_attrs)
			goto err_free_wq;
	}

系统维护了一个所有workqueue的list,list head定义如下:

static LIST_HEAD(workqueues);		/* PR: list of all workqueues */

workqueue_struct中的list成员就是挂入这个链表的节点。

workqueue有两种:unbound workqueue和per cpu workqueue。对于per cpu类型,cpu_pwqs指向了一组per cpu的pool_workqueue数据结构,用来维护workqueue和per cpu thread pool之间的关系。每个cpu都有两个thread pool,normal和高优先级的线程池,到底cpu_pwqs指向哪一个pool_workqueue(worker thread)是和workqueue的flag相关,如果标有WQ_HIGHPRI,那么cpu_pwqs指向高优先级的线程池。

3.2、workqueue attribute

挂入workqueue的work终究是需要worker线程来处理,针对worker线程有下面几个考量点(称之attribute):

(1)该worker线程的优先级

(2)该worker线程运行在哪一个CPU上

(3)如果worker线程可以运行在多个CPU上,且这些CPU属于不同的NUMA(Non Uniform Memory Access Architecture 非统一内存访问) node,那么是否在所有的NUMA node中都可以获取良好的性能。

对于per-CPU的workqueue,2和3不存在问题,哪个cpu上queue的work就在哪个cpu上执行,由于只能在一个确定的cpu上执行,因此其NUMA node也是确定的(一个CPU不可能属于两个NUMA node)。

对于优先级,per-CPU的workqueue使用WQ_HIGHPRI来标记。综上所述,per-CPU的workqueue不需要单独定义一个workqueue attribute,这也是为何在workqueue_struct中只有unbound_attrs这个成员来记录unbound workqueue的属性。

unbound workqueue由于不绑定在具体的cpu上,可以运行在系统中的任何一个cpu,直觉上似乎系统中有一个unbound thread pool就可以了,

不过让一个thread pool创建多种属性的worker线程是一个好的设计吗?本质上,thread pool应该创建属性一样的worker thread。因此,通过workqueue属性来对unbound workqueue进行分类,workqueue属性定义如下:include/linux/workqueue.h

 /**
 * struct workqueue_attrs - A struct for workqueue attributes.
 *
 * This can be used to change attributes of an unbound workqueue.
 */
struct workqueue_attrs {
	/**
	 * @nice: nice level
	 */
	int nice;
 
	/**
	 * @cpumask: allowed CPUs
	 */
	cpumask_var_t cpumask;
 
	/* NOTE(review): in 6.x kernels these two fields govern worker CPU
	 * affinity scoping and replace the older no_numa flag mentioned in
	 * the surrounding text -- confirm against the exact kernel version
	 * being described */
	bool affn_strict;
	enum wq_affn_scope affn_scope;

	/**
	 * @ordered: work items must be executed one by one in queueing order
	 */
	bool ordered;
};
  • nice是一个和thread优先级相关的属性,nice越低则优先级越高。
  • cpumask是该workqueue挂入的work允许在哪些cpu上运行。
  • affn_strict和affn_scope是和CPU/NUMA亲和范围相关的设定(旧内核中对应的是no_numa成员,6.x中已被取代)。
  • ordered表示该workqueue上的work必须按入队顺序逐个执行。

3.3、unbound workqueue和NUMA之间的联系

在多核 NUMA 系统中,unbound workqueue 和 NUMA 的联系主要体现在以下几个方面:

(1) CPU 和内存的局部性

NUMA 系统中的每个 CPU 核心都有一个与之相关联的内存节点(local node)。当任务在工作队列中执行时,如果任务被分配到不同的 CPU 上,可能会导致任务在访问内存时不再局部,尤其是当任务执行的 CPU 和任务所需的内存不在同一个 NUMA 节点时,会引起内存访问延迟。

虽然 unbound workqueue 允许任务在任何空闲的 CPU 上执行,但这可能导致任务在执行时跨 NUMA 节点访问内存,从而增加内存访问的延迟。因此,在高负载和高性能要求的系统中,任务的 NUMA 性能可能会受到影响。

(2) NUMA 节点亲和性(NUMA Node Affinity)

为了提高 NUMA 系统的性能,操作系统通常会尝试将进程或线程的执行绑定到特定的 NUMA 节点。这种机制可以通过在工作队列中设置 CPU 亲和性来实现。例如,如果工作队列中的任务频繁访问某个 NUMA 节点的内存,那么将该任务分配到距离该内存最近的 CPU 上执行,可以减少跨节点的内存访问延迟。

对于 unbound workqueue,内核通常不做 CPU 的固定绑定,因此它可能会跨 NUMA 节点执行任务,这会导致内存访问的效率降低。为了解决这个问题,可以通过以下几种方式来优化:

  • NUMA 优化的工作队列:虽然 unbound workqueue 默认不限制 CPU 亲和性,但内核也可以通过 NUMA 优化策略,将任务调度到合适的 CPU 上。比如,可以通过 create_workqueue() 函数的参数来设定 CPU 的亲和性,以避免任务过多地跨 NUMA 节点进行调度。

  • 软 NUMA 或 CPU 亲和性(CPU Affinity):操作系统也可以通过设置软 NUMA 亲和性来优化工作队列的调度,使得任务在某个 NUMA 节点的 CPU 上执行,以减少跨节点的延迟。

(3) 优化工作队列的 NUMA 性能

为了提高 unbound workqueue 的 NUMA 性能,系统可以通过以下方法进行优化:

  • 绑定任务到特定的 NUMA 节点:可以为工作队列中的任务设置 CPU 亲和性,使其更倾向于在特定的 NUMA 节点上执行。这有助于减少跨 NUMA 节点的内存访问。

  • NUMA-aware workqueue:内核可以采用 NUMA-aware 的工作队列策略。工作队列在执行时可能会根据任务的内存访问模式来选择最合适的 CPU 和 NUMA 节点,从而避免跨节点访问内存。

  • 使用 mbind/set_mempolicy 等接口:这些接口可以帮助在 NUMA 系统上控制内存的分配策略,并为工作队列的任务分配更合适的内存。

(4) NUMA 优化的调度策略

内核调度器也可以采用 NUMA-aware 的调度策略,确保线程和工作队列任务的 CPU 分配与内存访问的局部性一致。例如,如果一个任务主要访问某个 NUMA 节点的内存,调度器可以尝试将任务分配到这个节点的 CPU 上执行,从而提高性能。

当然,是否使用per node的pool workqueue用户是可以通过下面的参数进行设定的:

(1)workqueue attribute中的no_numa成员(较老内核;Linux 6.x中已由affn_scope/affn_strict取代)。

(2)通过workqueue.disable_numa这个参数,disable所有workqueue的numa affinity的支持(同样是较老内核的参数)。

static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

分析4、初始化workqueue的成员

除了max active,没有什么要说的,代码都简单而且直观。如果用户没有设定max active(或者说max active等于0),那么系统会给出一个缺省的设定。系统定义了两个最大值WQ_MAX_ACTIVE(512)和WQ_UNBOUND_MAX_ACTIVE(和cpu数目有关,最大值是cpu数目乘以4,当然也不能大于WQ_MAX_ACTIVE),分别限定per cpu workqueue和unbound workqueue的最大可以创建的worker thread的数目。wq_clamp_max_active可以将max active限制在一个确定的范围内。

分析5

 kernel/workqueue.c

  
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
	bool highpri = wq->flags & WQ_HIGHPRI;            //normal or high priority?
	int cpu, ret;
 
	if (!(wq->flags & WQ_UNBOUND)) {                  //per-cpu workqueue handling
		wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);    //allocate one pool_workqueue per cpu
		if (!wq->cpu_pwqs)
			return -ENOMEM;
 
		for_each_possible_cpu(cpu) {                  //set up each cpu in turn
			struct pool_workqueue *pwq =
				per_cpu_ptr(wq->cpu_pwqs, cpu);       //this cpu's pool_workqueue
			struct worker_pool *cpu_pools =    
				per_cpu(cpu_worker_pools, cpu);       //this cpu's pair of worker_pools
 
 
                    //link the dynamically allocated cpu_pwqs to the statically defined cpu_worker_pools
			init_pwq(pwq, wq, &cpu_pools[highpri]);
 
			mutex_lock(&wq->mutex);
			link_pwq(pwq);        //add the pool_workqueue onto workqueue_struct->pwqs
			mutex_unlock(&wq->mutex);
		}
		return 0;
	} else if (wq->flags & __WQ_ORDERED) {        //ordered unbound wq: apply ordered_wq_attrs
		ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
		/* there should only be single pwq for ordering guarantee */
		WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
			      wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
		     "ordering guarantee broken for workqueue %s\n", wq->name);
		return ret;
	} else {            //normal unbound wq: apply unbound_std_wq_attrs
		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
	}
}

通过alloc_percpu可以为每一个cpu分配一个pool_workqueue的memory。每个pool_workqueue都有一个对应的worker thread pool,对于per-CPU workqueue,它是静态定义的,如下:

/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools);
 
 
//NR_STD_WORKER_POOLS = 2: every cpu gets two standard worker pools
//i.e. this macro defines, per cpu, an array of two struct worker_pool named cpu_worker_pools

对于未绑定cpu的wq,系统也定义了相关属性的指针(也是分为normal和high两种)

kernel/workqueue.c 

 /* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
 
/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];

当然这两个真正的实例化是在系统初始化阶段动态申请内存完成的。
将动态分配的cpu_pwqs和静态定义的cpu_worker_pools关联起来:

kernel/workqueue.c 

/* initialize newly allocated @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
		     struct worker_pool *pool)
{
	/* pwq pointers are stored in work->data above the flag bits,
	 * so the pwq itself must be aligned accordingly */
	BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);

	memset(pwq, 0, sizeof(*pwq));

	pwq->pool = pool;	/* bind the pwq to its worker pool */
	pwq->wq = wq;		/* ... and to its owning workqueue */
	pwq->flush_color = -1;
	pwq->refcnt = 1;
	INIT_LIST_HEAD(&pwq->inactive_works);
	INIT_LIST_HEAD(&pwq->pwqs_node);
	INIT_LIST_HEAD(&pwq->mayday_node);
	kthread_init_work(&pwq->release_work, pwq_release_workfn);
}

pool_workqueue添加到workqueue_struct->pwqs链表中。kernel/workqueue.c

/* sync @pwq with the current state of its associated wq and link it */
static void link_pwq(struct pool_workqueue *pwq)
{
	struct workqueue_struct *wq = pwq->wq;
 
	lockdep_assert_held(&wq->mutex);
 
	/* may be called multiple times, ignore if already linked */
	if (!list_empty(&pwq->pwqs_node))
		return;
 
	/* set the matching work_color */
	pwq->work_color = wq->work_color;
 
	/* sync max_active to the current setting */
	pwq_adjust_max_active(pwq);
 
	/* link in @pwq: add this pwq onto the owning wq's pwqs list */
	list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
}
 

unbound workqueue有两种:

一种是normal type。

另外一种是ordered type。这种workqueue上的work是严格按照顺序执行的,不存在并发问题。ordered unbound workqueue的行为类似过去的single thread workqueue。

但是,无论那种类型的unbound workqueue都使用apply_workqueue_attrs来建立workqueue、pool wq和thread pool之间的关系。kernel/workqueue.c


static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
					const struct workqueue_attrs *attrs)
{
	struct apply_wqattrs_ctx *ctx;

 
	/* only unbound workqueues can change attributes */
	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))    //sanity check: only unbound wqs are handled here
		return -EINVAL;
 
	/* creating multiple pwqs breaks ordering guarantee */
	if (!list_empty(&wq->pwqs)) {       //sanity check: an explicitly ordered wq may own only one pwq
		if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
			return -EINVAL;
 
		wq->flags &= ~__WQ_ORDERED;
	}
 
	ctx = apply_wqattrs_prepare(wq, attrs);        //prepare pwqs matching the requested attributes
	if (!ctx)
		return -ENOMEM;
 
	/* the ctx has been prepared successfully, let's commit it */
	apply_wqattrs_commit(ctx);    //install the prepared pwqs; see analysis 7 below
	apply_wqattrs_cleanup(ctx);
 
	return 0;
}
 

kernel/workqueue.c

  
 
 
/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
		      const struct workqueue_attrs *attrs)
{
	struct apply_wqattrs_ctx *ctx;
	struct workqueue_attrs *new_attrs, *tmp_attrs;
	int node;
 
	lockdep_assert_held(&wq_pool_mutex);
 
	ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);
 
	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);    /* effective attrs for the new pwqs */
	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);    /* scratch attrs, modified per node below */
	if (!ctx || !new_attrs || !tmp_attrs)
		goto out_free;
 
	/*
	 * Calculate the attrs of the default pwq.
	 * If the user configured cpumask doesn't overlap with the
	 * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
	 */
	copy_workqueue_attrs(new_attrs, attrs);    /* start from the caller-supplied attrs */
	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);    /* restrict to the global unbound cpumask */
	if (unlikely(cpumask_empty(new_attrs->cpumask)))    /* no overlap at all? fall back to the global mask */
		cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
 
	/*
	 * We may create multiple pwqs with differing cpumasks.  Make a
	 * copy of @new_attrs which will be modified and used to obtain
	 * pools.
	 */
	copy_workqueue_attrs(tmp_attrs, new_attrs);    /* working copy */
 
	/*
	 * If something goes wrong during CPU up/down, we'll fall back to
	 * the default pwq covering whole @attrs->cpumask.  Always create
	 * it even if we don't use it immediately.
	 */
	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);    /* allocate the default pool workqueue */
	if (!ctx->dfl_pwq)
		goto out_free;
 
	/* walk every NUMA node */
	for_each_node(node) {       
		if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {    /* does this node need its own pwq? */
			/* fix: the trailing note on the next line was bare text outside any comment and would not compile */
			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);    /* this node gets its own pool wq */
			if (!ctx->pwq_tbl[node])
				goto out_free;
		} else {
			ctx->dfl_pwq->refcnt++;              /* node shares the default pwq; bump its refcount */
			ctx->pwq_tbl[node] = ctx->dfl_pwq;   /* this node uses the default pool wq */
		}
	}
 
	/* save the user configured attrs and sanitize it. */
	copy_workqueue_attrs(new_attrs, attrs);
	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
	ctx->attrs = new_attrs;
 
	ctx->wq = wq;
	free_workqueue_attrs(tmp_attrs);            /* drop the scratch attrs */
	return ctx;
 
out_free:
	free_workqueue_attrs(tmp_attrs);
	free_workqueue_attrs(new_attrs);
	apply_wqattrs_cleanup(ctx);
	return NULL;
}
 
 
 
 
/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
					const struct workqueue_attrs *attrs)
{
	struct worker_pool *pool;
	struct pool_workqueue *pwq;
 
	lockdep_assert_held(&wq_pool_mutex);
 
	pool = get_unbound_pool(attrs);        //look up (or create) a worker_pool matching these attrs
	if (!pool)
		return NULL;
 
    //allocate the pool_workqueue on the pool's home NUMA node; see analysis 6 below
	pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
	if (!pwq) {
		put_unbound_pool(pool);
		return NULL;
	}
 
	init_pwq(pwq, wq, pool);        //even unbound pwqs must be tied to their worker_pool and wq
	return pwq;
}

pwq_tbl数组用来保存unbound workqueue各个node的pool workqueue的指针,new_attrs和tmp_attrs都是一些计算workqueue attribute的中间变量,开始的时候设定为用户传入的workqueue的attribute。

如何为unbound workqueue的pool workqueue寻找对应的线程池?

具体的代码在get_unbound_pool函数中:

kernel/workqueue.c

  
/**
 * get_unbound_pool - get a worker_pool with the specified attributes
 * @attrs: the attributes of the worker_pool to get
 *
 * Obtain a worker_pool which has the same attributes as @attrs, bump the
 * reference count and return it.  If there already is a matching
 * worker_pool, it will be used; otherwise, this function attempts to
 * create a new one.
 *
 * Should be called with wq_pool_mutex held.
 *
 * Return: On success, a worker_pool with the same attributes as @attrs.
 * On failure, %NULL.
 */
static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
{
	u32 hash = wqattrs_hash(attrs);
	struct worker_pool *pool;
	int node;
	int target_node = NUMA_NO_NODE;
 
	lockdep_assert_held(&wq_pool_mutex);
 
	/* do we already have a matching pool?  if attrs match, reuse it instead of creating a new one */
	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
		if (wqattrs_equal(pool->attrs, attrs)) {
			pool->refcnt++;
			return pool;
		}
	}
 
	/* if cpumask is contained inside a NUMA node, we belong to that node */
	if (wq_numa_enabled) {
		for_each_node(node) {
			if (cpumask_subset(attrs->cpumask,
					   wq_numa_possible_cpumask[node])) {
				target_node = node;
				break;
			}
		}
	}
 
	/* nope, create a new one: no matching pool exists, build one with these attrs */
	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
	if (!pool || init_worker_pool(pool) < 0)
		goto fail;
 
	lockdep_set_subclass(&pool->lock, 1);	/* see put_pwq() */
	copy_workqueue_attrs(pool->attrs, attrs);
	pool->node = target_node;
 
	/*
	 * no_numa isn't a worker_pool attribute, always clear it.  See
	 * 'struct workqueue_attrs' comments for detail.
	 */
	pool->attrs->no_numa = false;
 
	if (worker_pool_assign_id(pool) < 0)
		goto fail;
 
	/* create and start the initial worker */
	if (wq_online && !create_worker(pool))
		goto fail;
 
	/* install: hash the new pool into unbound_pool_hash so later lookups can share it */
	hash_add(unbound_pool_hash, &pool->hash_node, hash);
 
	return pool;
fail:
	if (pool)
		put_unbound_pool(pool);
	return NULL;
}

per cpu的workqueue的pool workqueue对应的线程池也是per cpu的,每个cpu有两个线程池(normal和high priority),因此将pool workqueue和thread pool对应起来是非常简单的事情。对于unbound workqueue,对应关系没有那么直接,如果属性相同,多个unbound workqueue的pool workqueue可能对应一个thread pool。

系统使用哈希表来保存所有的unbound worker thread pool,定义如下:

 /* PL: hash of all unbound pools keyed by pool->attrs */
static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
 

在创建unbound workqueue的时候,pool workqueue对应的worker thread pool需要在这个哈希表中搜索,如果有相同属性的worker thread pool的话,那么就不需要创建新的线程池,代码如下:

 	/* do we already have a matching pool?  有相同属相的,则不需要再创建了新的线程池 */
	hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
		if (wqattrs_equal(pool->attrs, attrs)) {
			pool->refcnt++;
			return pool;
		}
	}

分析6:  各个node分配pool workqueue并初始化

kernel/workqueue.c

 

static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
		      const struct workqueue_attrs *attrs,
		      const cpumask_var_t unbound_cpumask)
{
... 	
/*
	 * If something goes wrong during CPU up/down, we'll fall back to
	 * the default pwq covering whole @attrs->cpumask.  Always create
	 * it even if we don't use it immediately.
	 */
	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);    //分配default pool workqueue
	if (!ctx->dfl_pwq)
		goto out_free;
 
     //遍历node 
	for_each_node(node) {       
		if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {    //是否使用default pool wq 
			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);    /* 该node使用自己的pool wq */
			if (!ctx->pwq_tbl[node])
				goto out_free;
		} else {
			ctx->dfl_pwq->refcnt++;              //默认的使用计数
			ctx->pwq_tbl[node] = ctx->dfl_pwq;   //该node使用default pool wq 
		}
	}
 ...
}

分析7:

所有的node的pool workqueue及其worker thread pool已经ready,需要安装到workqueue中。

kernel/workqueue.c

  
/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
{
	int node;
 
	/* all pwqs have been created successfully, let's install'em */
	mutex_lock(&ctx->wq->mutex);
 
	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);	/* record the new attrs on the wq */
 
	/* save the previous pwq and install the new one */
	for_each_node(node)
		ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
							  ctx->pwq_tbl[node]);
 
	/* @dfl_pwq might not have been used, ensure it's linked */
	link_pwq(ctx->dfl_pwq);
	swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);	/* old dfl_pwq goes back into ctx for cleanup */
 
	mutex_unlock(&ctx->wq->mutex);
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值