Commit ebeb84ad authored by tanghui, committed by Zhong Jinghua

cpuset: Introduce new interface for scheduler dynamic affinity

hulk inclusion
category: feature
bugzilla: 186575, https://gitee.com/openeuler/kernel/issues/I526XC

--------------------------------

Add a 'preferred_cpus' file and related interfaces to cgroup cpuset, along with a per-task 'preferred_cpuset' file under /proc, as the user interface for scheduler dynamic affinity.
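
A minimal usage sketch is shown below (illustrative only: the cgroup v1 mount
point /sys/fs/cgroup/cpuset, the child cpuset name "demo", and the write_str()
helper are assumptions for the example, not part of this patch):

  /* Sketch only: exercise the new preferred-CPUs files from userspace. */
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  static int write_str(const char *path, const char *val)
  {
          int fd = open(path, O_WRONLY);
          ssize_t n;

          if (fd < 0) {
                  perror(path);
                  return -1;
          }
          n = write(fd, val, strlen(val));
          close(fd);
          return n < 0 ? -1 : 0;
  }

  int main(void)
  {
          char path[64];

          /*
           * Per-cgroup preferred CPUs: the list must be a subset of the
           * cpuset's cpus, and the root cpuset rejects writes (-EACCES).
           * The mount point and "demo" group are assumed to exist.
           */
          write_str("/sys/fs/cgroup/cpuset/demo/cpuset.preferred_cpus", "0-3\n");

          /*
           * Per-thread preferred CPUs via the new proc entry; the list
           * must be a subset of the thread's allowed CPUs.
           */
          snprintf(path, sizeof(path), "/proc/%d/task/%d/preferred_cpuset",
                   getpid(), getpid());
          write_str(path, "0-1\n");
          return 0;
  }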
Signed-off-by: tanghui <tanghui20@huawei.com>
Signed-off-by: Zheng Zucheng <zhengzucheng@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Parent 229f5c1f
@@ -3251,6 +3251,76 @@ static const struct file_operations proc_setgroups_operations = {
};
#endif /* CONFIG_USER_NS */
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
static int preferred_cpuset_show(struct seq_file *m, void *v)
{
struct inode *inode = m->private;
struct task_struct *p;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
if (p->prefer_cpus)
seq_printf(m, "%*pbl\n", cpumask_pr_args(p->prefer_cpus));
else
seq_putc(m, '\n');
put_task_struct(p);
return 0;
}
static ssize_t preferred_cpuset_write(struct file *file, const char __user *buf,
size_t count, loff_t *offset)
{
cpumask_var_t new_mask;
int retval;
struct inode *inode = file_inode(file);
struct task_struct *p;
p = get_proc_task(inode);
if (!p)
return -ESRCH;
if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_put_task;
}
retval = cpumask_parselist_user(buf, count, new_mask);
if (retval < 0)
goto out_free_cpumask;
retval = set_prefer_cpus_ptr(p, new_mask);
if (retval < 0)
goto out_free_cpumask;
retval = count;
out_free_cpumask:
free_cpumask_var(new_mask);
out_put_task:
put_task_struct(p);
return retval;
}
static int preferred_cpuset_open(struct inode *inode, struct file *filp)
{
return single_open(filp, preferred_cpuset_show, inode);
}
static const struct file_operations proc_preferred_cpuset_operations = {
.open = preferred_cpuset_open,
.write = preferred_cpuset_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
#endif
static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
@@ -3820,6 +3890,9 @@ static const struct pid_entry tid_base_stuff[] = {
#ifdef CONFIG_BPF_SCHED
REG("tag", 0644, proc_pid_tag_operations),
#endif
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
REG("preferred_cpuset", 0644, proc_preferred_cpuset_operations),
#endif
};
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
......
@@ -1422,7 +1422,11 @@ struct task_struct {
KABI_RESERVE(6)
#endif
KABI_USE(7, void *pf_io_worker)
#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) && !defined(__GENKSYMS__)
KABI_USE(8, cpumask_t *prefer_cpus)
#else
KABI_RESERVE(8)
#endif
KABI_RESERVE(9)
KABI_RESERVE(10)
KABI_RESERVE(11)
@@ -2206,6 +2210,13 @@ static inline int sched_qos_cpu_overload(void)
}
#endif
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
int set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask);
int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask);
void sched_prefer_cpus_free(struct task_struct *p);
#endif
#ifdef CONFIG_BPF_SCHED
extern void sched_settag(struct task_struct *tsk, s64 tag);
......
@@ -214,6 +214,9 @@ struct task_struct init_task
#ifdef CONFIG_SECURITY
.security = NULL,
#endif
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
.prefer_cpus = NULL,
#endif
#ifdef CONFIG_SECCOMP_FILTER
.seccomp = { .filter_count = ATOMIC_INIT(0) },
#endif
......
@@ -107,6 +107,9 @@ struct cpuset {
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
nodemask_t mems_allowed;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t prefer_cpus;
#endif
/* effective CPUs and Memory Nodes allow to tasks */
cpumask_var_t effective_cpus;
@@ -193,6 +196,9 @@ struct cpuset {
struct tmpmasks {
cpumask_var_t addmask, delmask; /* For partition root */
cpumask_var_t new_cpus; /* For update_cpumasks_hier() */
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t prefer_cpus;
#endif
};
static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
@@ -472,15 +478,24 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
cpumask_var_t *pmask1, *pmask2, *pmask3;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t *pmask4;
#endif
if (cs) {
pmask1 = &cs->cpus_allowed;
pmask2 = &cs->effective_cpus;
pmask3 = &cs->subparts_cpus;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
pmask4 = &cs->prefer_cpus;
#endif
} else {
pmask1 = &tmp->new_cpus;
pmask2 = &tmp->addmask;
pmask3 = &tmp->delmask;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
pmask4 = &tmp->prefer_cpus;
#endif
}
if (!zalloc_cpumask_var(pmask1, GFP_KERNEL))
@@ -491,9 +506,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
if (!zalloc_cpumask_var(pmask3, GFP_KERNEL))
goto free_two;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
if (!zalloc_cpumask_var(pmask4, GFP_KERNEL))
goto free_three;
#endif
return 0;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
free_three:
free_cpumask_var(*pmask3);
#endif
free_two:
free_cpumask_var(*pmask2);
free_one:
@@ -509,11 +532,17 @@ static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{
if (cs) {
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
free_cpumask_var(cs->prefer_cpus);
#endif
free_cpumask_var(cs->cpus_allowed);
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->subparts_cpus);
}
if (tmp) {
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
free_cpumask_var(tmp->prefer_cpus);
#endif
free_cpumask_var(tmp->new_cpus);
free_cpumask_var(tmp->addmask);
free_cpumask_var(tmp->delmask);
@@ -537,6 +566,9 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
return NULL;
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_copy(trial->prefer_cpus, cs->prefer_cpus);
#endif
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
@@ -580,6 +612,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
rcu_read_lock();
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
ret = -EINVAL;
if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed))
goto out;
#endif
/* Each of our child cpusets must be a subset of us */
ret = -EBUSY;
cpuset_for_each_child(c, css, cur)
@@ -644,6 +681,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
return ret;
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
static cpumask_var_t prefer_cpus_attach;
static void update_tasks_prefer_cpumask(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_prefer_cpus_ptr(task, cs->prefer_cpus);
css_task_iter_end(&it);
}
/*
* update_prefer_cpumask - update the prefer_cpus mask of a cpuset and
* all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
if (cs == &top_cpuset)
return -EACCES;
/*
* An empty prefer_cpus is ok, which means that the cpuset's tasks have the
* dynamic affinity feature disabled.
* Since cpulist_parse() fails on an empty mask, we special case
* that parsing.
*/
if (!*buf) {
cpumask_clear(trialcs->prefer_cpus);
} else {
retval = cpulist_parse(buf, trialcs->prefer_cpus);
if (retval < 0)
return retval;
}
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus))
return 0;
if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed))
return -EINVAL;
update_tasks_prefer_cpumask(trialcs);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus);
spin_unlock_irq(&callback_lock);
return 0;
}
#endif
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
@@ -2229,6 +2326,10 @@ static void cpuset_attach(struct cgroup_taskset *tset)
else
guarantee_online_cpus(cs, cpus_attach);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
#endif
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) {
@@ -2237,6 +2338,9 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* fail. TODO: have a better way to handle failure here
*/
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
set_prefer_cpus_ptr(task, prefer_cpus_attach);
#endif
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
@@ -2297,6 +2401,9 @@ typedef enum {
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
FILE_DYNAMIC_CPULIST,
#endif
} cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -2427,6 +2534,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
case FILE_MEMLIST:
retval = update_nodemask(cs, trialcs, buf);
break;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
case FILE_DYNAMIC_CPULIST:
retval = update_prefer_cpumask(cs, trialcs, buf);
break;
#endif
default:
retval = -EINVAL;
break;
@@ -2474,6 +2586,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_SUBPARTS_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->subparts_cpus));
break;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
case FILE_DYNAMIC_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus));
break;
#endif
default:
ret = -EINVAL;
}
@@ -2681,7 +2798,15 @@ static struct cftype legacy_files[] = {
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE_ENABLED,
},
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
{
.name = "preferred_cpus",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_DYNAMIC_CPULIST,
},
#endif
{ } /* terminate */
};
@@ -2830,6 +2955,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_copy(cs->prefer_cpus, parent->prefer_cpus);
#endif
spin_unlock_irq(&callback_lock);
out_unlock:
percpu_up_write(&cpuset_rwsem);
@@ -2912,6 +3040,9 @@ static void cpuset_fork(struct task_struct *task)
return;
set_cpus_allowed_ptr(task, current->cpus_ptr);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
set_prefer_cpus_ptr(task, current->prefer_cpus);
#endif
task->mems_allowed = current->mems_allowed;
}
@@ -2945,17 +3076,26 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL));
#endif
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_clear(top_cpuset.prefer_cpus);
#endif
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
top_cpuset.relax_domain_level = -1;
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL));
#endif
return 0;
}
@@ -2992,6 +3132,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_t prefer_cpus;
#endif
bool is_empty;
spin_lock_irq(&callback_lock);
@@ -3010,6 +3153,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) {
cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed);
cpumask_copy(cs->prefer_cpus, &prefer_cpus);
update_tasks_prefer_cpumask(cs);
}
#endif
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
......
@@ -468,6 +468,9 @@ void free_task(struct task_struct *tsk)
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
sched_prefer_cpus_free(tsk);
#endif
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -929,6 +932,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->seccomp.filter = NULL;
#endif
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
tsk->prefer_cpus = NULL;
#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
@@ -2047,6 +2054,12 @@ static __latent_entropy struct task_struct *copy_process(
rt_mutex_init_task(p);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
retval = sched_prefer_cpus_fork(p, current->prefer_cpus);
if (retval)
goto bad_fork_free;
#endif
lockdep_assert_irqs_enabled();
#ifdef CONFIG_PROVE_LOCKING
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
......
@@ -9763,6 +9763,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
return 0;
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
int sched_prefer_cpus_fork(struct task_struct *p, struct cpumask *mask)
{
p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
if (!p->prefer_cpus)
return -ENOMEM;
if (mask)
cpumask_copy(p->prefer_cpus, mask);
else
cpumask_clear(p->prefer_cpus);
return 0;
}
void sched_prefer_cpus_free(struct task_struct *p)
{
kfree(p->prefer_cpus);
}
static void do_set_prefer_cpus(struct task_struct *p,
const struct cpumask *new_mask)
{
struct rq *rq = task_rq(p);
bool queued, running;
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued) {
/*
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
lockdep_assert_held(&rq->__lock);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
cpumask_copy(p->prefer_cpus, new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_next_task(rq, p);
}
/*
* Change a given task's preferred CPU affinity. The scheduler will prefer to
* migrate the thread to the CPUs in the preferred bitmask.
*
* NOTE: the caller must have a valid reference to the task, the
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
static int __set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
{
struct rq_flags rf;
struct rq *rq;
int ret = 0;
if (unlikely(!p->prefer_cpus))
return -EINVAL;
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
if (cpumask_equal(p->prefer_cpus, new_mask))
goto out;
if (!cpumask_subset(new_mask, p->cpus_ptr)) {
ret = -EINVAL;
goto out;
}
do_set_prefer_cpus(p, new_mask);
out:
task_rq_unlock(rq, p, &rf);
return ret;
}
int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
if (p->sched_class != &fair_sched_class)
return 0;
return __set_prefer_cpus_ptr(p, new_mask, false);
}
#endif
#ifdef CONFIG_CFS_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
......