diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37fc2dbfa005ff607890df3a8bbc6b..c75ea0b8ec59c999123e391ec0094f52de40d12a 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. or other driver-specific files in the Documentation/watchdog/ directory. + workqueue.disable_numa + By default, all work items queued to unbound + workqueues are affine to the NUMA nodes they're + issued on, which results in better behavior in + general. If NUMA affinity needs to be disabled for + whatever reason, this option can be used. Note + that this can also be controlled per-workqueue for + workqueues visible under /sys/bus/workqueue/. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 835d12b769601ad87360b723b754c136978643a0..7179756393781e3346d5a6ee7445403a63590f3f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -119,10 +119,15 @@ struct delayed_work { /* * A struct for workqueue attributes. This can be used to change * attributes of an unbound workqueue. + * + * Unlike other fields, ->no_numa isn't a property of a worker_pool. It + * only modifies how apply_workqueue_attrs() selects pools and thus doesn't + * participate in pool hash calculations or equality comparisons. 
*/ struct workqueue_attrs { int nice; /* nice level */ cpumask_var_t cpumask; /* allowed CPUs */ + bool no_numa; /* disable NUMA affinity */ }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 57cd77de4a4fdd8ead040bffab82f2e5614ecbb8..729ac6a448605feb2e981371db5fda5ece3b9d82 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -516,21 +519,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * first_pwq - return the first pool_workqueue of the specified workqueue - * @wq: the target workqueue - * - * This must be called either with wq->mutex held or sched RCU read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. 
- */ -static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) -{ - assert_rcu_or_wq_mutex(wq); - return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, - pwqs_node); -} - /** * unbound_pwq_by_node - return the unbound pool_workqueue for the given node * @wq: the target workqueue @@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = { __ATTR_NULL, }; -static ssize_t wq_pool_id_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); - struct worker_pool *pool; - int written; + const char *delim = ""; + int node, written = 0; rcu_read_lock_sched(); - pool = first_pwq(wq)->pool; - written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock_sched(); return written; @@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev, return ret ?: count; } +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} 
+ static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), __ATTR_NULL, }; @@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { - if (!wq_numa_enabled) + if (!wq_numa_enabled || attrs->no_numa) goto use_dfl; /* does @node have any online CPUs @attrs wants? */ @@ -3951,6 +3980,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, cpumask = target_attrs->cpumask; mutex_lock(&wq->mutex); + if (wq->unbound_attrs->no_numa) + goto out_unlock; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); @@ -4763,6 +4794,11 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; + if (wq_disable_numa) { + pr_info("workqueue: NUMA affinity support disabled\n"); + return; + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); BUG_ON(!wq_update_unbound_numa_attrs_buf);