提交 1d3650f7 编写于 作者: T Tejun Heo

cfq-iosched: implement hierarchy-ready cfq_group charge scaling

Currently, cfqg charges are scaled directly according to cfqg->weight.
Regardless of the number of active cfqgs or the amount of active
weights, a given weight value always scales charge the same way.  This
works fine as long as all cfqgs are treated equally regardless of
their positions in the hierarchy, which is what cfq currently
implements.  It can't work in hierarchical settings because the
interpretation of a given weight value depends on where the weight is
located in the hierarchy.

This patch reimplements cfqg charge scaling so that it can be used to
support hierarchy properly.  The scheme is fairly simple and
light-weight.

* When a cfqg is added to the service tree, its share of vdisktime
  (vfraction) is calculated.  The calculation walks up the tree to the
  root, computing the fraction the cfqg has at each level of the
  hierarchy.  At each level, the fraction can be calculated as

    cfqg->weight / parent->children_weight

  By compounding these, the global fraction of vdisktime the cfqg has
  claim to - vfraction - can be determined.

* When the cfqg needs to be charged, the charge is scaled inversely
  proportionally to the vfraction.

The new scaling scheme uses the same CFQ_SERVICE_SHIFT for fixed point
representation as before; however, the smallest scaling factor is now
1 (ie. 1 << CFQ_SERVICE_SHIFT).  This is different from before where 1
was for CFQ_WEIGHT_DEFAULT and higher weight would result in smaller
scaling factor.

While this shifts the global scale of vdisktime a bit, it doesn't
change the relative relationships among cfqgs and the scheduling
result isn't different.

cfq_group_notify_queue_add uses fixed CFQ_IDLE_DELAY when appending
new cfqg to the service tree.  The specific value of CFQ_IDLE_DELAY
didn't have any relevance to vdisktime before and is unlikely to cause
any visible behavior difference now especially as the scale shift
isn't that large.

As the new scheme now makes proper distinction between cfqg->weight
and ->leaf_weight, reverse the weight aliasing for root cfqgs.  For
root, both weights are now mapped to ->leaf_weight instead of the
other way around.

Because we're still using cfqg_flat_parent(), this patch shouldn't
change the scheduling behavior in any noticeable way.

v2: Beefed up comments on vfraction as requested by Vivek.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
上级 7918ffb5
...@@ -236,6 +236,18 @@ struct cfq_group { ...@@ -236,6 +236,18 @@ struct cfq_group {
int nr_active; int nr_active;
unsigned int children_weight; unsigned int children_weight;
/*
* vfraction is the fraction of vdisktime that the tasks in this
* cfqg are entitled to. This is determined by compounding the
* ratios walking up from this cfqg to the root.
*
* It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
* vfractions on a service tree is approximately 1. The sum may
* deviate a bit due to rounding errors and fluctuations caused by
* cfqgs entering and leaving the service tree.
*/
unsigned int vfraction;
/* /*
* There are two weights - (internal) weight is the weight of this * There are two weights - (internal) weight is the weight of this
* cfqg against the sibling cfqgs. leaf_weight is the wight of * cfqg against the sibling cfqgs. leaf_weight is the wight of
...@@ -891,13 +903,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) ...@@ -891,13 +903,27 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
} }
static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) /**
* cfqg_scale_charge - scale disk time charge according to cfqg weight
* @charge: disk time being charged
* @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
*
* Scale @charge according to @vfraction, which is in range (0, 1]. The
* scaling is inversely proportional.
*
* scaled = charge / vfraction
*
* The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
*/
static inline u64 cfqg_scale_charge(unsigned long charge,
unsigned int vfraction)
{ {
u64 d = delta << CFQ_SERVICE_SHIFT; u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
d = d * CFQ_WEIGHT_DEFAULT; /* charge / vfraction */
do_div(d, cfqg->weight); c <<= CFQ_SERVICE_SHIFT;
return d; do_div(c, vfraction);
return c;
} }
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
...@@ -1237,7 +1263,9 @@ cfq_update_group_weight(struct cfq_group *cfqg) ...@@ -1237,7 +1263,9 @@ cfq_update_group_weight(struct cfq_group *cfqg)
static void static void
cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{ {
unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
struct cfq_group *pos = cfqg; struct cfq_group *pos = cfqg;
struct cfq_group *parent;
bool propagate; bool propagate;
/* add to the service tree */ /* add to the service tree */
...@@ -1248,22 +1276,34 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) ...@@ -1248,22 +1276,34 @@ cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
st->total_weight += cfqg->weight; st->total_weight += cfqg->weight;
/* /*
* Activate @cfqg and propagate activation upwards until we meet an * Activate @cfqg and calculate the portion of vfraction @cfqg is
* already activated node or reach root. * entitled to. vfraction is calculated by walking the tree
* towards the root calculating the fraction it has at each level.
* The compounded ratio is how much vfraction @cfqg owns.
*
* Start with the proportion tasks in this cfqg has against active
* children cfqgs - its leaf_weight against children_weight.
*/ */
propagate = !pos->nr_active++; propagate = !pos->nr_active++;
pos->children_weight += pos->leaf_weight; pos->children_weight += pos->leaf_weight;
vfr = vfr * pos->leaf_weight / pos->children_weight;
while (propagate) { /*
struct cfq_group *parent = cfqg_flat_parent(pos); * Compound ->weight walking up the tree. Both activation and
* vfraction calculation are done in the same loop. Propagation
if (!parent) * stops once an already activated node is met. vfraction
break; * calculation should always continue to the root.
*/
while ((parent = cfqg_flat_parent(pos))) {
if (propagate) {
propagate = !parent->nr_active++; propagate = !parent->nr_active++;
parent->children_weight += pos->weight; parent->children_weight += pos->weight;
}
vfr = vfr * pos->weight / parent->children_weight;
pos = parent; pos = parent;
} }
cfqg->vfraction = max_t(unsigned, vfr, 1);
} }
static void static void
...@@ -1309,6 +1349,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) ...@@ -1309,6 +1349,7 @@ cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
/* @pos has 0 nr_active at this point */ /* @pos has 0 nr_active at this point */
WARN_ON_ONCE(pos->children_weight); WARN_ON_ONCE(pos->children_weight);
pos->vfraction = 0;
if (!parent) if (!parent)
break; break;
...@@ -1381,6 +1422,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, ...@@ -1381,6 +1422,7 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
unsigned int used_sl, charge, unaccounted_sl = 0; unsigned int used_sl, charge, unaccounted_sl = 0;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count; - cfqg->service_tree_idle.count;
unsigned int vfr;
BUG_ON(nr_sync < 0); BUG_ON(nr_sync < 0);
used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
...@@ -1390,10 +1432,15 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, ...@@ -1390,10 +1432,15 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
else if (!cfq_cfqq_sync(cfqq) && !nr_sync) else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice; charge = cfqq->allocated_slice;
/* Can't update vdisktime while group is on service tree */ /*
* Can't update vdisktime while on service tree and cfqg->vfraction
* is valid only while on it. Cache vfr, leave the service tree,
* update vdisktime and go back on. The re-addition to the tree
* will also update the weights as necessary.
*/
vfr = cfqg->vfraction;
cfq_group_service_tree_del(st, cfqg); cfq_group_service_tree_del(st, cfqg);
cfqg->vdisktime += cfq_scale_slice(charge, cfqg); cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
/* If a new weight was requested, update now, off tree */
cfq_group_service_tree_add(st, cfqg); cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */ /* This group is being expired. Save the context */
...@@ -1669,44 +1716,44 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, ...@@ -1669,44 +1716,44 @@ static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
#endif /* CONFIG_DEBUG_BLK_CGROUP */ #endif /* CONFIG_DEBUG_BLK_CGROUP */
static struct cftype cfq_blkcg_files[] = { static struct cftype cfq_blkcg_files[] = {
/* on root, weight is mapped to leaf_weight */
{ {
.name = "weight_device", .name = "weight_device",
.read_seq_string = cfqg_print_weight_device, .flags = CFTYPE_ONLY_ON_ROOT,
.write_string = cfqg_set_weight_device, .read_seq_string = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256, .max_write_len = 256,
}, },
{ {
.name = "weight", .name = "weight",
.read_seq_string = cfq_print_weight, .flags = CFTYPE_ONLY_ON_ROOT,
.write_u64 = cfq_set_weight, .read_seq_string = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight,
}, },
/* on root, leaf_weight is mapped to weight */ /* no such mapping necessary for !roots */
{ {
.name = "leaf_weight_device", .name = "weight_device",
.flags = CFTYPE_ONLY_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfqg_print_weight_device, .read_seq_string = cfqg_print_weight_device,
.write_string = cfqg_set_weight_device, .write_string = cfqg_set_weight_device,
.max_write_len = 256, .max_write_len = 256,
}, },
{ {
.name = "leaf_weight", .name = "weight",
.flags = CFTYPE_ONLY_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfq_print_weight, .read_seq_string = cfq_print_weight,
.write_u64 = cfq_set_weight, .write_u64 = cfq_set_weight,
}, },
/* no such mapping necessary for !roots */
{ {
.name = "leaf_weight_device", .name = "leaf_weight_device",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfqg_print_leaf_weight_device, .read_seq_string = cfqg_print_leaf_weight_device,
.write_string = cfqg_set_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device,
.max_write_len = 256, .max_write_len = 256,
}, },
{ {
.name = "leaf_weight", .name = "leaf_weight",
.flags = CFTYPE_NOT_ON_ROOT,
.read_seq_string = cfq_print_leaf_weight, .read_seq_string = cfq_print_leaf_weight,
.write_u64 = cfq_set_leaf_weight, .write_u64 = cfq_set_leaf_weight,
}, },
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册