Commit 14a40ffc authored by Tejun Heo

sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY

PF_THREAD_BOUND was originally used to mark kernel threads which were
bound to a specific CPU using kthread_bind(), and a task with the flag
set allows cpus_allowed modifications only by the task itself.  Workqueue is
currently abusing it to prevent userland from meddling with
cpus_allowed of workqueue workers.

What we need is a flag to prevent userland from messing with
cpus_allowed of certain kernel tasks.  In kernel, anyone can
(incorrectly) squash the flag, and, for worker-type usages,
restricting cpus_allowed modification to the task itself doesn't
provide meaningful extra protection as other tasks can inject work
items into the task anyway.

This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY.
sched_setaffinity() checks the flag and returns -EINVAL if set.
set_cpus_allowed_ptr() is no longer affected by the flag.

This will allow simplifying workqueue worker CPU affinity management.
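
For illustration, a minimal userland sketch of the behaviour described
above (this program is not part of the patch; the PID is a placeholder
for a real kworker thread found via ps):

    /*
     * Attempt to pin a PF_NO_SETAFFINITY task (e.g. a kworker) to CPU 0.
     * After this patch the call is expected to fail with EINVAL.
     */
    #define _GNU_SOURCE
    #include <errno.h>
    #include <sched.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>

    int main(void)
    {
            pid_t kworker_pid = 1234;       /* placeholder: use a real kworker PID */
            cpu_set_t mask;

            CPU_ZERO(&mask);
            CPU_SET(0, &mask);

            if (sched_setaffinity(kworker_pid, sizeof(mask), &mask) < 0)
                    printf("sched_setaffinity: %s (EINVAL expected)\n",
                           strerror(errno));
            else
                    printf("affinity change unexpectedly succeeded\n");
            return 0;
    }
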
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Parent 2e109a28
@@ -1793,7 +1793,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_SWAPWRITE      0x00800000      /* Allowed to write to swap */
 #define PF_SPREAD_PAGE    0x01000000      /* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB    0x02000000      /* Spread some slab caches over cpuset */
-#define PF_THREAD_BOUND   0x04000000      /* Thread bound to specific cpu */
+#define PF_NO_SETAFFINITY 0x04000000      /* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY      0x08000000      /* Early kill for mce process policy */
 #define PF_MEMPOLICY      0x10000000      /* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER   0x20000000      /* Thread belongs to the rt mutex tester */
......
@@ -2224,11 +2224,11 @@ retry_find_task:
                 tsk = tsk->group_leader;

         /*
-         * Workqueue threads may acquire PF_THREAD_BOUND and become
+         * Workqueue threads may acquire PF_NO_SETAFFINITY and become
          * trapped in a cpuset, or RT worker may be born in a cgroup
          * with no rt_runtime allocated.  Just say no.
          */
-        if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+        if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
                 ret = -EINVAL;
                 rcu_read_unlock();
                 goto out_unlock_cgroup;
......
@@ -1388,16 +1388,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)

         cgroup_taskset_for_each(task, cgrp, tset) {
                 /*
-                 * Kthreads bound to specific cpus cannot be moved to a new
-                 * cpuset; we cannot change their cpu affinity and
-                 * isolating such threads by their set of allowed nodes is
-                 * unnecessary.  Thus, cpusets are not applicable for such
-                 * threads.  This prevents checking for success of
-                 * set_cpus_allowed_ptr() on all attached tasks before
-                 * cpus_allowed may be changed.
+                 * Kthreads which disallow setaffinity shouldn't be moved
+                 * to a new cpuset; we don't want to change their cpu
+                 * affinity and isolating such threads by their set of
+                 * allowed nodes is unnecessary.  Thus, cpusets are not
+                 * applicable for such threads.  This prevents checking for
+                 * success of set_cpus_allowed_ptr() on all attached tasks
+                 * before cpus_allowed may be changed.
                  */
                 ret = -EINVAL;
-                if (task->flags & PF_THREAD_BOUND)
+                if (task->flags & PF_NO_SETAFFINITY)
                         goto out_unlock;
                 ret = security_task_setscheduler(task);
                 if (ret)
......
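
A hedged userland sketch of what the cgroup/cpuset hunks above mean in
practice (not part of the patch; the cpuset mount point and the PID are
assumed placeholders): writing the PID of a PF_NO_SETAFFINITY task into
a cpuset's tasks file is rejected with EINVAL, exactly as it was with
PF_THREAD_BOUND.

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            /* assumed legacy-hierarchy cpuset mount and a placeholder kworker PID */
            const char *tasks_file = "/sys/fs/cgroup/cpuset/demo/tasks";
            const char *kworker_pid = "1234\n";
            int fd = open(tasks_file, O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, kworker_pid, strlen(kworker_pid)) < 0)
                    printf("attach rejected: %s (EINVAL expected)\n",
                           strerror(errno));
            close(fd);
            return 0;
    }
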
@@ -260,7 +260,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
 {
         /* It's safe because the task is inactive. */
         do_set_cpus_allowed(p, cpumask_of(cpu));
-        p->flags |= PF_THREAD_BOUND;
+        p->flags |= PF_NO_SETAFFINITY;
 }

 /**
......
@@ -4126,6 +4126,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         get_task_struct(p);
         rcu_read_unlock();

+        if (p->flags & PF_NO_SETAFFINITY) {
+                retval = -EINVAL;
+                goto out_put_task;
+        }
         if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
                 retval = -ENOMEM;
                 goto out_put_task;
@@ -4773,11 +4777,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
                 goto out;
         }

-        if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
-                ret = -EINVAL;
-                goto out;
-        }
-
         do_set_cpus_allowed(p, new_mask);

         /* Can the task run on the task's current CPU?  If so, we're done */
......
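
As a hedged kernel-side sketch of the semantics the two scheduler hunks
above establish (none of this code is in the patch; the thread function
and helper are made-up names): kthread_bind() marks the thread
PF_NO_SETAFFINITY, which sched_setaffinity() from userland now rejects
with -EINVAL, while in-kernel callers may still retarget the thread via
set_cpus_allowed_ptr().

    #include <linux/cpumask.h>
    #include <linux/err.h>
    #include <linux/kthread.h>
    #include <linux/sched.h>

    static int example_thread_fn(void *unused)
    {
            /* idle until asked to stop */
            while (!kthread_should_stop()) {
                    set_current_state(TASK_INTERRUPTIBLE);
                    schedule();
            }
            return 0;
    }

    static struct task_struct *example_start_bound_worker(void)
    {
            struct task_struct *tsk;

            tsk = kthread_create(example_thread_fn, NULL, "example_worker");
            if (IS_ERR(tsk))
                    return tsk;

            kthread_bind(tsk, 0);           /* sets PF_NO_SETAFFINITY via __kthread_bind() */
            wake_up_process(tsk);

            /* still permitted from kernel code after this patch */
            set_cpus_allowed_ptr(tsk, cpu_online_mask);
            return tsk;
    }
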
@@ -1757,12 +1757,8 @@ static struct worker *create_worker(struct worker_pool *pool)
         set_user_nice(worker->task, pool->attrs->nice);
         set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);

-        /*
-         * %PF_THREAD_BOUND is used to prevent userland from meddling with
-         * cpumask of workqueue workers.  This is an abuse.  We need
-         * %PF_NO_SETAFFINITY.
-         */
-        worker->task->flags |= PF_THREAD_BOUND;
+        /* prevent userland from meddling with cpumask of workqueue workers */
+        worker->task->flags |= PF_NO_SETAFFINITY;

         /*
          * The caller is responsible for ensuring %POOL_DISASSOCIATED
@@ -3876,7 +3872,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                 }

                 wq->rescuer = rescuer;
-                rescuer->task->flags |= PF_THREAD_BOUND;
+                rescuer->task->flags |= PF_NO_SETAFFINITY;
                 wake_up_process(rescuer->task);
         }
......