Commit c483f4fe authored by yu kuai, committed by Yang Yingliang

blk-mq: use static_rqs instead of rqs to iterate tags

hulk inclusion
category: bugfix
bugzilla: 34280
CVE: NA

---------------------------

tags->rqs[] is not cleared when a driver tag is freed, in order to
avoid an extra store to a shared area in the per-io path. However,
there is a window between getting a driver tag and writing
tags->rqs[], so we may see a stale rq in tags->rqs[] that has
already been freed, as in the following case:

blk_mq_get_request         blk_mq_queue_tag_busy_iter
  -> blk_mq_get_tag
                             -> bt_for_each
                               -> bt_iter
                                 -> rq = tags->rqs[]
                                 -> rq->q
  -> blk_mq_rq_ctx_init
    -> data->hctx->tags->rqs[rq->tag] = rq;

In addition, tags->rqs[] only contains requests that have acquired a
driver tag, so it is not accurate for the io-scheduler case when
accounting busy tags in part_in_flight.

To fix both problems, this patch changes blk_mq_queue_tag_busy_iter()
to use tags->static_rqs[] instead of tags->rqs[]. We have to check
whether an io scheduler is attached in order to decide between
hctx->tags and hctx->sched_tags, and we try to take a non-zero
reference on q_usage_counter before doing so, which avoids races with
nr_hw_queues updates, io-scheduler switches and even queue cleanup.

Add an 'inflight' parameter that selects between iterating only
in-flight requests and iterating all busy tags, add a new helper,
blk_mq_queue_tag_inflight_iter(), that iterates all in-flight tags,
and export it for drivers.
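
As an illustration only (not part of this patch), a driver could use
the newly exported helper roughly as follows; the demo_* names are
hypothetical and the sketch simply follows the busy_iter_fn callback
signature used in this kernel series:

  #include <linux/blk-mq.h>

  /* Called once for each in-flight request on the queue. */
  static void demo_count_inflight(struct blk_mq_hw_ctx *hctx,
                                  struct request *rq, void *priv,
                                  bool reserved)
  {
          unsigned int *count = priv;

          (*count)++;
  }

  static unsigned int demo_inflight_requests(struct request_queue *q)
  {
          unsigned int count = 0;

          /* Walks static_rqs[] of sched_tags/tags, in-flight rqs only. */
          blk_mq_queue_tag_inflight_iter(q, demo_count_inflight, &count);
          return count;
  }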
Signed-off-by: yu kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Parent f5c88ad3
@@ -216,37 +216,51 @@ struct bt_iter_data {
 	busy_iter_fn *fn;
 	void *data;
 	bool reserved;
+	bool inflight;
 };
 
 static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 {
 	struct bt_iter_data *iter_data = data;
 	struct blk_mq_hw_ctx *hctx = iter_data->hctx;
-	struct blk_mq_tags *tags = hctx->tags;
 	bool reserved = iter_data->reserved;
+	struct blk_mq_tags *tags;
 	struct request *rq;
 
+	tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags;
 	if (!reserved)
 		bitnr += tags->nr_reserved_tags;
-	rq = tags->rqs[bitnr];
 
 	/*
-	 * We can hit rq == NULL here, because the tagging functions
-	 * test and set the bit before assining ->rqs[].
+	 * Because tags->rqs[] will not been cleaned when free driver tag
+	 * and there is a window between get driver tag and write tags->rqs[],
+	 * so we may see stale rq in tags->rqs[] which may have been freed.
+	 * Using static_rqs[] is safer.
+	 */
+	rq = tags->static_rqs[bitnr];
+
+	/*
+	 * There is a small window between get tag and blk_mq_rq_ctx_init,
+	 * so rq->q and rq->mq_hctx maybe different.
 	 */
-	if (rq && rq->q == hctx->queue)
+	if (rq && rq->q == hctx->queue &&
+	    (!iter_data->inflight ||
+	     blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT))
 		iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
 
-static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
-			busy_iter_fn *fn, void *data, bool reserved)
+static void bt_for_each(struct blk_mq_hw_ctx *hctx,
+			struct sbitmap_queue *bt, busy_iter_fn *fn,
+			void *data, bool reserved, bool inflight)
 {
 	struct bt_iter_data iter_data = {
 		.hctx = hctx,
 		.fn = fn,
 		.data = data,
 		.reserved = reserved,
+		.inflight = inflight,
 	};
 
 	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@@ -314,22 +328,23 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 }
 EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
 
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
-		void *priv)
+static void __blk_mq_queue_tag_busy_iter(struct request_queue *q,
+		busy_iter_fn *fn, void *priv, bool inflight)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	/*
-	 * __blk_mq_update_nr_hw_queues will update the nr_hw_queues and
-	 * queue_hw_ctx after freeze the queue, so we use q_usage_counter
-	 * to avoid race with it.
+	 * Get a reference of the queue unless it has been zero. We use this
+	 * to avoid the race with the code that would modify the hctxs after
+	 * freeze and drain the queue, including updating nr_hw_queues, io
+	 * scheduler switching and queue clean up.
 	 */
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
 	queue_for_each_hw_ctx(q, hctx, i) {
-		struct blk_mq_tags *tags = hctx->tags;
+		struct blk_mq_tags *tags;
 
 		/*
 		 * If not software queues are currently mapped to this
@@ -338,13 +353,45 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		if (!blk_mq_hw_queue_mapped(hctx))
 			continue;
 
+		tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags;
 		if (tags->nr_reserved_tags)
-			bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
-		bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
+			bt_for_each(hctx, &tags->breserved_tags,
+					fn, priv, true, inflight);
+		bt_for_each(hctx, &tags->bitmap_tags,
+				fn, priv, false, inflight);
+
+		/*
+		 * flush_rq represents the rq with REQ_PREFLUSH and REQ_FUA
+		 * (if FUA is not supported by device) to be issued to
+		 * device. So we need to consider it when iterate inflight
+		 * rqs, but needn't to count it when iterate busy tags.
+		 */
+		if (inflight &&
+		    blk_mq_rq_state(hctx->fq->flush_rq) == MQ_RQ_IN_FLIGHT)
+			fn(hctx, hctx->fq->flush_rq, priv, false);
 	}
 	blk_queue_exit(q);
 }
 
+/*
+ * Iterate all the busy tags including pending and in-flight ones.
+ */
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+		void *priv)
+{
+	__blk_mq_queue_tag_busy_iter(q, fn, priv, false);
+}
+
+/*
+ * Iterate all the inflight tags.
+ */
+void blk_mq_queue_tag_inflight_iter(struct request_queue *q,
+		busy_iter_fn *fn, void *priv)
+{
+	__blk_mq_queue_tag_busy_iter(q, fn, priv, true);
+}
+EXPORT_SYMBOL(blk_mq_queue_tag_inflight_iter);
+
 static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
 		    bool round_robin, int node)
 {
......
@@ -112,7 +112,7 @@ void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
 	struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
 	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	blk_mq_queue_tag_inflight_iter(q, blk_mq_check_inflight, &mi);
 }
 
 static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
@@ -131,7 +131,7 @@ void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
 	struct mq_inflight mi = { .part = part, .inflight = inflight, };
 
 	inflight[0] = inflight[1] = 0;
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+	blk_mq_queue_tag_inflight_iter(q, blk_mq_check_inflight_rw, &mi);
 }
 
 void blk_freeze_queue_start(struct request_queue *q)
@@ -875,7 +875,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+	blk_mq_queue_tag_inflight_iter(q, blk_mq_check_expired, &next);
 
 	if (next != 0) {
 		mod_timer(&q->timeout, next);
......
@@ -320,7 +320,8 @@ void blk_freeze_queue_start(struct request_queue *q);
 void blk_mq_freeze_queue_wait(struct request_queue *q);
 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
+void blk_mq_queue_tag_inflight_iter(struct request_queue *q, busy_iter_fn *fn,
+				    void *priv);
 
 int blk_mq_map_queues(struct blk_mq_tag_set *set);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
......