Commit 243865da authored by Hui Tang, committed by Yongqiang Liu

cpuset: Introduce new interface for scheduler dynamic affinity

hulk inclusion
category: feature
bugzilla: 187173, https://gitee.com/openeuler/kernel/issues/I5G4IH
CVE: NA

--------------------------------

Add a 'prefer_cpus' sysfs interface and the related kernel support to the cgroup cpuset controller.
Signed-off-by: Hui Tang <tanghui20@huawei.com>
Reviewed-by: Zhang Qiao <zhangqiao22@huawei.com>
Reviewed-by: Chen Hui <judy.chenhui@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
Parent 5cabb5b5
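A minimal usage sketch (not from the patch itself): with CONFIG_QOS_SCHED_DYNAMIC_AFFINITY enabled, the "preferred_cpus" cftype added below should surface as a per-cpuset file, presumably cpuset.preferred_cpus under a cgroup v1 cpuset mount; the mount point /sys/fs/cgroup/cpuset and the child cpuset "mygrp" used here are assumptions. A write goes through update_prefer_cpumask(), which returns -EINVAL when the list is not a subset of the cpuset's cpus_allowed and treats an empty write as disabling dynamic affinity for the cpuset's tasks:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Hypothetical paths: a cgroup v1 cpuset mount and a child cpuset "mygrp". */
	const char *path = "/sys/fs/cgroup/cpuset/mygrp/cpuset.preferred_cpus";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}

	/*
	 * Prefer CPUs 0-3. The kernel side (update_prefer_cpumask) returns
	 * -EINVAL if the list is not a subset of the cpuset's cpus_allowed;
	 * writing an empty string clears the mask and disables dynamic
	 * affinity for the cpuset's tasks.
	 */
	if (fprintf(f, "0-3\n") < 0 || fclose(f) != 0) {
		perror("write preferred_cpus");
		return EXIT_FAILURE;
	}

	return EXIT_SUCCESS;
}

Reading the file back is served by cpuset_common_seq_show() and prints the current prefer_cpus mask in the same CPU-list format.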
@@ -1247,7 +1247,16 @@ struct task_struct {
#else
KABI_RESERVE(5)
#endif
#if !defined(__GENKSYMS__)
#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY)
cpumask_t *prefer_cpus;
#else
KABI_RESERVE(6)
#endif
#else
KABI_RESERVE(6)
#endif
KABI_RESERVE(7)
KABI_RESERVE(8)
@@ -1964,4 +1973,12 @@ static inline int sched_qos_cpu_overload(void)
}
#endif
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
int dynamic_affinity_enabled(void);
int set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask);
int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
void sched_prefer_cpus_free(struct task_struct *p);
#endif
#endif
@@ -180,6 +180,9 @@ struct task_struct init_task
#ifdef CONFIG_SECURITY
.security = NULL,
#endif
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
.prefer_cpus = NULL,
#endif
#ifdef CONFIG_PID_RESERVE
.fork_pid_union = {
.fork_pid = 0,
......
@@ -104,6 +104,9 @@ struct cpuset {
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
nodemask_t mems_allowed;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t prefer_cpus;
#endif
/* effective CPUs and Memory Nodes allow to tasks */
cpumask_var_t effective_cpus;
@@ -436,11 +439,22 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
goto free_cs;
if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
goto free_cpus;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
if (!alloc_cpumask_var(&trial->prefer_cpus, GFP_KERNEL))
goto free_prefer_cpus;
#endif
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_copy(trial->prefer_cpus, cs->prefer_cpus);
#endif
return trial;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
free_prefer_cpus:
free_cpumask_var(trial->effective_cpus);
#endif
free_cpus:
free_cpumask_var(trial->cpus_allowed);
free_cs:
@@ -456,6 +470,9 @@ static void free_trial_cpuset(struct cpuset *trial)
{
free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
free_cpumask_var(trial->prefer_cpus);
#endif
kfree(trial);
}
@@ -487,6 +504,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
rcu_read_lock();
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
ret = -EINVAL;
if (!cpumask_subset(cur->prefer_cpus, trial->cpus_allowed))
goto out;
#endif
/* Each of our child cpusets must be a subset of us */
ret = -EBUSY;
cpuset_for_each_child(c, css, cur)
@@ -551,6 +573,66 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
return ret;
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
static cpumask_var_t prefer_cpus_attach;
static void update_tasks_prefer_cpumask(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_prefer_cpus_ptr(task, cs->prefer_cpus);
css_task_iter_end(&it);
}
/*
* update_prefer_cpumask - update the prefer_cpus mask of a cpuset and
* all tasks in it
* @cs: the cpuset to consider
* @trialcs: trial cpuset
* @buf: buffer of cpu numbers written to this cpuset
*/
static int update_prefer_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
if (cs == &top_cpuset)
return -EACCES;
/*
* An empty prefer_cpus is OK; it means the cpuset's tasks have the
* dynamic affinity feature disabled.
* Since cpulist_parse() fails on an empty mask, we special case
* that parsing.
*/
if (!*buf) {
cpumask_clear(trialcs->prefer_cpus);
} else {
retval = cpulist_parse(buf, trialcs->prefer_cpus);
if (retval < 0)
return retval;
}
/* Nothing to do if the cpus didn't change */
if (cpumask_equal(cs->prefer_cpus, trialcs->prefer_cpus))
return 0;
if (!cpumask_subset(trialcs->prefer_cpus, cs->cpus_allowed))
return -EINVAL;
update_tasks_prefer_cpumask(trialcs);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->prefer_cpus, trialcs->prefer_cpus);
spin_unlock_irq(&callback_lock);
return 0;
}
#endif
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
@@ -1543,6 +1625,10 @@ static void cpuset_attach(struct cgroup_taskset *tset)
else
guarantee_online_cpus(cs, cpus_attach);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_copy(prefer_cpus_attach, cs->prefer_cpus);
#endif
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) {
@@ -1551,6 +1637,9 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* fail. TODO: have a better way to handle failure here
*/
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
set_prefer_cpus_ptr(task, prefer_cpus_attach);
#endif
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, task);
@@ -1610,6 +1699,9 @@ typedef enum {
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
FILE_DYNAMIC_CPULIST,
#endif
} cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -1735,6 +1827,11 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
case FILE_MEMLIST:
retval = update_nodemask(cs, trialcs, buf);
break;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
case FILE_DYNAMIC_CPULIST:
retval = update_prefer_cpumask(cs, trialcs, buf);
break;
#endif
default:
retval = -EINVAL;
break;
@@ -1778,6 +1875,11 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
case FILE_DYNAMIC_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->prefer_cpus));
break;
#endif
default:
ret = -EINVAL;
}
@@ -1935,7 +2037,15 @@ static struct cftype files[] = {
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE_ENABLED,
},
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
{
.name = "preferred_cpus",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_DYNAMIC_CPULIST,
},
#endif
{ } /* terminate */
};
@@ -1959,17 +2069,28 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
goto free_cs;
if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
goto free_cpus;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
if (!alloc_cpumask_var(&cs->prefer_cpus, GFP_KERNEL))
goto free_effective_cpus;
#endif
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_clear(cs->prefer_cpus);
#endif
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
free_effective_cpus:
free_cpumask_var(cs->effective_cpus);
#endif
free_cpus:
free_cpumask_var(cs->cpus_allowed);
free_cs:
@@ -2034,6 +2155,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_copy(cs->prefer_cpus, parent->prefer_cpus);
#endif
spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
@@ -2065,6 +2189,9 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
free_cpumask_var(cs->prefer_cpus);
#endif
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
@@ -2099,6 +2226,9 @@ static void cpuset_fork(struct task_struct *task)
return;
set_cpus_allowed_ptr(task, &current->cpus_allowed);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
set_prefer_cpus_ptr(task, current->prefer_cpus);
#endif
task->mems_allowed = current->mems_allowed;
}
@@ -2129,11 +2259,17 @@ int __init cpuset_init(void)
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
BUG_ON(!alloc_cpumask_var(&top_cpuset.prefer_cpus, GFP_KERNEL));
#endif
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_clear(top_cpuset.prefer_cpus);
#endif
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2144,6 +2280,9 @@ int __init cpuset_init(void)
return err;
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
BUG_ON(!alloc_cpumask_var(&prefer_cpus_attach, GFP_KERNEL));
#endif
return 0;
}
@@ -2180,6 +2319,9 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_t prefer_cpus;
#endif
bool is_empty;
spin_lock_irq(&callback_lock);
@@ -2198,6 +2340,13 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
if (mems_updated && !nodes_empty(cs->mems_allowed))
update_tasks_nodemask(cs);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
if (!cpumask_subset(cs->prefer_cpus, cs->cpus_allowed)) {
cpumask_and(&prefer_cpus, cs->prefer_cpus, cs->cpus_allowed);
cpumask_copy(cs->prefer_cpus, &prefer_cpus);
update_tasks_prefer_cpumask(cs);
}
#endif
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
......
@@ -459,6 +459,9 @@ void free_task(struct task_struct *tsk)
arch_release_task_struct(tsk);
if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
sched_prefer_cpus_free(tsk);
#endif
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
@@ -888,6 +891,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->seccomp.filter = NULL;
#endif
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
tsk->prefer_cpus = NULL;
#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
clear_tsk_need_resched(tsk);
@@ -1862,6 +1869,12 @@ static __latent_entropy struct task_struct *copy_process(
if (retval < 0)
goto bad_fork_free;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
retval = sched_prefer_cpus_fork(p, current);
if (retval)
goto bad_fork_free;
#endif
/*
* If multiple threads are within copy_process(), then this check
* triggers too late. This doesn't hurt, the check is only there
......
@@ -7191,6 +7191,101 @@ static int __maybe_unused cpu_period_quota_parse(char *buf,
return 0;
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig)
{
p->prefer_cpus = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
if (!p->prefer_cpus)
return -ENOMEM;
if (orig->prefer_cpus)
cpumask_copy(p->prefer_cpus, orig->prefer_cpus);
else
cpumask_clear(p->prefer_cpus);
return 0;
}
void sched_prefer_cpus_free(struct task_struct *p)
{
kfree(p->prefer_cpus);
}
static void do_set_prefer_cpus(struct task_struct *p,
const struct cpumask *new_mask)
{
struct rq *rq = task_rq(p);
bool queued, running;
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued) {
/*
* Because __kthread_bind() calls this on blocked tasks without
* holding rq->lock.
*/
lockdep_assert_held(&rq->lock);
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
}
if (running)
put_prev_task(rq, p);
cpumask_copy(p->prefer_cpus, new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
set_curr_task(rq, p);
}
/*
* Change a given task's preferred CPU affinity. Migrating the thread to the
* preferred cpus is prioritized according to the preferred bitmask.
*
* NOTE: the caller must have a valid reference to the task, the
* task must not exit() & deallocate itself prematurely. The
* call is not atomic; no spinlocks may be held.
*/
static int __set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check)
{
struct rq_flags rf;
struct rq *rq;
int ret = 0;
if (unlikely(!p->prefer_cpus))
return -EINVAL;
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
if (cpumask_equal(p->prefer_cpus, new_mask))
goto out;
if (!cpumask_subset(new_mask, &p->cpus_allowed)) {
ret = -EINVAL;
goto out;
}
do_set_prefer_cpus(p, new_mask);
out:
task_rq_unlock(rq, p, &rf);
return ret;
}
int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
if (p->sched_class != &fair_sched_class)
return 0;
return __set_prefer_cpus_ptr(p, new_mask, false);
}
#endif
#ifdef CONFIG_CFS_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
......