diff --git a/block/blk-mq.c b/block/blk-mq.c
index b29e7dc7b309e4cf939cd2c011b26b38dfd4df73..9e6b064e533979446a936c45c18f500c6f87725b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -904,6 +904,44 @@ static bool reorder_tags_to_front(struct list_head *list)
 	return first != NULL;
 }
 
+static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags,
+				void *key)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
+
+	list_del(&wait->task_list);
+	clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state);
+	blk_mq_run_hw_queue(hctx, true);
+	return 1;
+}
+
+static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
+{
+	struct sbq_wait_state *ws;
+
+	/*
+	 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait.
+	 * The thread which wins the race to grab this bit adds the hardware
+	 * queue to the wait queue.
+	 */
+	if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) ||
+	    test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state))
+		return false;
+
+	init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
+	ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx);
+
+	/*
+	 * As soon as this returns, it's no longer safe to fiddle with
+	 * hctx->dispatch_wait, since a completion can wake up the wait queue
+	 * and unlock the bit.
+	 */
+	add_wait_queue(&ws->wait, &hctx->dispatch_wait);
+	return true;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -931,15 +969,22 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 				continue;
 
 			/*
-			 * We failed getting a driver tag. Mark the queue(s)
-			 * as needing a restart. Retry getting a tag again,
-			 * in case the needed IO completed right before we
-			 * marked the queue as needing a restart.
+			 * The initial allocation attempt failed, so we need to
+			 * rerun the hardware queue when a tag is freed.
 			 */
-			blk_mq_sched_mark_restart(hctx);
-			if (!blk_mq_get_driver_tag(rq, &hctx, false))
+			if (blk_mq_dispatch_wait_add(hctx)) {
+				/*
+				 * It's possible that a tag was freed in the
+				 * window between the allocation failure and
+				 * adding the hardware queue to the wait queue.
+				 */
+				if (!blk_mq_get_driver_tag(rq, &hctx, false))
+					break;
+			} else {
 				break;
+			}
 		}
+
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -995,10 +1040,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 	 *
 	 * blk_mq_run_hw_queue() already checks the STOPPED bit
 	 *
-	 * If RESTART is set, then let completion restart the queue
-	 * instead of potentially looping here.
+	 * If RESTART or TAG_WAITING is set, then let completion restart
+	 * the queue instead of potentially looping here.
 	 */
-	if (!blk_mq_sched_needs_restart(hctx))
+	if (!blk_mq_sched_needs_restart(hctx) &&
+	    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
 		blk_mq_run_hw_queue(hctx, true);
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 8e4df3d6c8cd9dbd1ddd96e25e1abae9a4aa4b6f..001d30d727c56c4d46e5e572ded575ca50cc85ff 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -33,6 +33,7 @@ struct blk_mq_hw_ctx {
 	struct blk_mq_ctx	**ctxs;
 	unsigned int		nr_ctx;
 
+	wait_queue_t		dispatch_wait;
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
@@ -160,6 +161,7 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
+	BLK_MQ_S_TAG_WAITING	= 3,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
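
For readers following the TAG_WAITING mechanism above, here is a standalone userspace sketch (not part of the patch, and not kernel API) of the same "lock bit guards the wait-queue entry" pattern: only the thread that wins the test-and-set of TAG_WAITING may touch dispatch_wait and register it on the tag wait queue, and the wake callback clears the bit before rerunning the queue. All types and helpers below (hw_ctx, wait_queue, dispatch_wait_add, dispatch_wake) are simplified stand-ins built on C11 atomics and a pthread mutex; build with something like cc -std=c11 -pthread.

/*
 * Illustrative userspace analogue only -- these are NOT the kernel types
 * or functions from the patch above.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TAG_WAITING	(1u << 0)	/* stands in for BLK_MQ_S_TAG_WAITING */

struct hw_ctx {
	atomic_uint state;		/* stands in for hctx->state */
	bool queued_on_wait_queue;	/* stands in for hctx->dispatch_wait */
};

struct wait_queue {
	pthread_mutex_t lock;
	struct hw_ctx *waiter;		/* at most one waiter in this sketch */
};

/* Analogue of blk_mq_dispatch_wait_add(): the TAG_WAITING bit is the lock. */
static bool dispatch_wait_add(struct hw_ctx *hctx, struct wait_queue *ws)
{
	/* Bit already set: another path owns the wait-queue entry, back off. */
	if (atomic_fetch_or(&hctx->state, TAG_WAITING) & TAG_WAITING)
		return false;

	/* We won the bit, so it is safe to touch the wait-queue entry. */
	pthread_mutex_lock(&ws->lock);
	hctx->queued_on_wait_queue = true;
	ws->waiter = hctx;
	pthread_mutex_unlock(&ws->lock);
	return true;
}

/* Analogue of blk_mq_dispatch_wake(): called when a "tag" is freed. */
static void dispatch_wake(struct wait_queue *ws)
{
	pthread_mutex_lock(&ws->lock);
	if (ws->waiter) {
		struct hw_ctx *hctx = ws->waiter;

		ws->waiter = NULL;
		hctx->queued_on_wait_queue = false;
		/* Drop the bit so the queue may register itself again later. */
		atomic_fetch_and(&hctx->state, ~TAG_WAITING);
		/* The patch calls blk_mq_run_hw_queue() at this point. */
		printf("rerun hardware queue\n");
	}
	pthread_mutex_unlock(&ws->lock);
}

int main(void)
{
	struct hw_ctx hctx = { .state = 0 };
	struct wait_queue ws = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("first add:  %d\n", dispatch_wait_add(&hctx, &ws)); /* 1: wins the bit */
	printf("second add: %d\n", dispatch_wait_add(&hctx, &ws)); /* 0: bit already held */
	dispatch_wake(&ws);                                        /* clears the bit, reruns */
	printf("third add:  %d\n", dispatch_wait_add(&hctx, &ws)); /* 1: can wait again */
	return 0;
}

Here atomic_fetch_or() plays the role that test_and_set_bit_lock() plays in the patch: whichever caller sees the bit clear owns the wait-queue entry until the wake callback clears the bit again.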