diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt index f2235a16252951c32b2a2ad151d92ac814b18e02..fdf7dff3f607d96bd03a896d92ba585b2a2d5ab0 100644 --- a/Documentation/cgroups/cpusets.txt +++ b/Documentation/cgroups/cpusets.txt @@ -392,8 +392,10 @@ Put simply, it costs less to balance between two smaller sched domains than one big one, but doing so means that overloads in one of the two domains won't be load balanced to the other one. -By default, there is one sched domain covering all CPUs, except those -marked isolated using the kernel boot time "isolcpus=" argument. +By default, there is one sched domain covering all CPUs, including those +marked isolated using the kernel boot time "isolcpus=" argument. However, +the isolated CPUs will not participate in load balancing, and will not +have tasks running on them unless explicitly assigned. This default load balancing across all CPUs is not well suited for the following two situations: @@ -465,6 +467,10 @@ such partially load balanced cpusets, as they may be artificially constrained to some subset of the CPUs allowed to them, for lack of load balancing to the other CPUs. +CPUs in "cpuset.isolcpus" were excluded from load balancing by the +isolcpus= kernel boot option, and will never be load balanced regardless +of the value of "cpuset.sched_load_balance" in any cpuset. + 1.7.1 sched_load_balance implementation details. ------------------------------------------------ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3f3308824fa41b473ac77e2927cfdec626bd8c0b..f74d4cc3a3e54f72026449a8f70ff287dc807248 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -337,6 +337,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern cpumask_var_t cpu_isolated_map; + extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) diff --git a/init/main.c b/init/main.c index 6f0f1c5ff8cc82c41b36f107f68732003c5b1dc3..4a6974e678396d73189f4629537475bc9f3aee42 100644 --- a/init/main.c +++ b/init/main.c @@ -654,8 +654,8 @@ asmlinkage __visible void __init start_kernel(void) page_writeback_init(); proc_root_init(); nsfs_init(); - cgroup_init(); cpuset_init(); + cgroup_init(); taskstats_init_early(); delayacct_init(); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 29a7b2cc593e3f50a425f1dbbf187d5d1fb4542f..a220fdb66568eea7de9768197d2e275a04ee479b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) static void pidlist_free(void *p) { - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); + kvfree(p); } /* @@ -5040,6 +5037,9 @@ int __init cgroup_init(void) WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } + + if (ss->bind) + ss->bind(init_css_set.subsys[ssid]); } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc7f4748d34a9fe017bd9a42353db7a4095240d2..c68f0721df10e4ca71621e63cf33b088539dce76 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; + if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) + goto done; + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.effective_cpus); + cpumask_and(doms[0], top_cpuset.effective_cpus, + non_isolated_cpus); goto done; } @@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, * the corresponding sched domain. */ if (!cpumask_empty(cp->cpus_allowed) && - !is_sched_load_balance(cp)) + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) continue; if (is_sched_load_balance(cp)) @@ -748,6 +755,7 @@ static int generate_sched_domains(cpumask_var_t **domains, if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); + cpumask_and(dp, dp, non_isolated_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -760,6 +768,7 @@ static int generate_sched_domains(cpumask_var_t **domains, BUG_ON(nslot != ndoms); done: + free_cpumask_var(non_isolated_cpus); kfree(csa); /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 261af7bfcb67d55e9ec9c19f657453f3e2607803..2f7937ee9e3a6504d310122600cbdf9d7dc7d183 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -306,6 +306,9 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) {