diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3a27d31fcda60250854ced98c68fe3db32ac587c..97337214bec4286afa212cf5f271ce26b578d399 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) bfqd->queue_weights_tree.rb_node->rb_right) #ifdef CONFIG_BFQ_GROUP_IOSCHED ) || - (bfqd->num_active_groups > 0 + (bfqd->num_groups_with_pending_reqs > 0 #endif ); } @@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, */ break; } - bfqd->num_active_groups--; + + /* + * The decrement of num_groups_with_pending_reqs is + * not performed immediately upon the deactivation of + * entity, but it is delayed to when it also happens + * that the first leaf descendant bfqq of entity gets + * all its pending requests completed. The following + * instructions perform this delayed decrement, if + * needed. See the comments on + * num_groups_with_pending_reqs for details. + */ + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; + bfqd->num_groups_with_pending_reqs--; + } } } @@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * fact, if there are active groups, then, for condition (i) * to become false, it is enough that an active group contains * more active processes or sub-groups than some other active - * group. We address this issue with the following bi-modal - * behavior, implemented in the function + * group. More precisely, for condition (i) to hold because of + * such a group, it is not even necessary that the group is + * (still) active: it is sufficient that, even if the group + * has become inactive, some of its descendant processes still + * have some request already dispatched but still waiting for + * completion. In fact, requests have still to be guaranteed + * their share of the throughput even after being + * dispatched. In this respect, it is easy to show that, if a + * group frequently becomes inactive while still having + * in-flight requests, and if, when this happens, the group is + * not considered in the calculation of whether the scenario + * is asymmetric, then the group may fail to be guaranteed its + * fair share of the throughput (basically because idling may + * not be performed for the descendant processes of the group, + * but it had to be). We address this issue with the + * following bi-modal behavior, implemented in the function * bfq_symmetric_scenario(). * - * If there are active groups, then the scenario is tagged as + * If there are groups with requests waiting for completion + * (as commented above, some of these groups may even be + * already inactive), then the scenario is tagged as * asymmetric, conservatively, without checking any of the * conditions (i) and (ii). So the device is idled for bfqq. * This behavior matches also the fact that groups are created - * exactly if controlling I/O (to preserve bandwidth and - * latency guarantees) is a primary concern. + * exactly if controlling I/O is a primary concern (to + * preserve bandwidth and latency guarantees). * - * On the opposite end, if there are no active groups, then - * only condition (i) is actually controlled, i.e., provided - * that condition (i) holds, idling is not performed, - * regardless of whether condition (ii) holds. In other words, - * only if condition (i) does not hold, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. Since there - * are no active groups, then, to control condition (i) it is - * enough to check whether all active queues have the same - * weight. + * On the opposite end, if there are no groups with requests + * waiting for completion, then only condition (i) is actually + * controlled, i.e., provided that condition (i) holds, idling + * is not performed, regardless of whether condition (ii) + * holds. In other words, only if condition (i) does not hold, + * then idling is allowed, and the device tends to be + * prevented from queueing many requests, possibly of several + * processes. Since there are no groups with requests waiting + * for completion, then, to control condition (i) it is enough + * to check just whether all the queues with requests waiting + * for completion also have the same weight. * * Not checking condition (ii) evidently exposes bfqq to the * risk of getting less throughput than its fair share. @@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * bfqq is weight-raised is checked explicitly here. More * precisely, the compound condition below takes into account * also the fact that, even if bfqq is being weight-raised, - * the scenario is still symmetric if all active queues happen - * to be weight-raised. Actually, we should be even more - * precise here, and differentiate between interactive weight - * raising and soft real-time weight raising. + * the scenario is still symmetric if all queues with requests + * waiting for completion happen to be + * weight-raised. Actually, we should be even more precise + * here, and differentiate between interactive weight raising + * and soft real-time weight raising. * * As a side note, it is worth considering that the above * device-idling countermeasures may however fail in the @@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->queue_weights_tree = RB_ROOT; - bfqd->num_active_groups = 0; + bfqd->num_groups_with_pending_reqs = 0; INIT_LIST_HEAD(&bfqd->active_list); INIT_LIST_HEAD(&bfqd->idle_list); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 77651d817ecd36fe59827f2aa55f9c4ec5ffb979..0b02bf302de07706fbfdc5b5ef66da2545e01eee 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -196,6 +196,9 @@ struct bfq_entity { /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; + + /* flag, set if the entity is counted in groups_with_pending_reqs */ + bool in_groups_with_pending_reqs; }; struct bfq_group; @@ -448,10 +451,54 @@ struct bfq_data { * bfq_weights_tree_[add|remove] for further details). */ struct rb_root queue_weights_tree; + /* - * number of groups with requests still waiting for completion + * Number of groups with at least one descendant process that + * has at least one request waiting for completion. Note that + * this accounts for also requests already dispatched, but not + * yet completed. Therefore this number of groups may differ + * (be larger) than the number of active groups, as a group is + * considered active only if its corresponding entity has + * descendant queues with at least one request queued. This + * number is used to decide whether a scenario is symmetric. + * For a detailed explanation see comments on the computation + * of the variable asymmetric_scenario in the function + * bfq_better_to_idle(). + * + * However, it is hard to compute this number exactly, for + * groups with multiple descendant processes. Consider a group + * that is inactive, i.e., that has no descendant process with + * pending I/O inside BFQ queues. Then suppose that + * num_groups_with_pending_reqs is still accounting for this + * group, because the group has descendant processes with some + * I/O request still in flight. num_groups_with_pending_reqs + * should be decremented when the in-flight request of the + * last descendant process is finally completed (assuming that + * nothing else has changed for the group in the meantime, in + * terms of composition of the group and active/inactive state of child + * groups and processes). To accomplish this, an additional + * pending-request counter must be added to entities, and must + * be updated correctly. To avoid this additional field and operations, + * we resort to the following tradeoff between simplicity and + * accuracy: for an inactive group that is still counted in + * num_groups_with_pending_reqs, we decrement + * num_groups_with_pending_reqs when the first descendant + * process of the group remains with no request waiting for + * completion. + * + * Even this simpler decrement strategy requires a little + * carefulness: to avoid multiple decrements, we flag a group, + * more precisely an entity representing a group, as still + * counted in num_groups_with_pending_reqs when it becomes + * inactive. Then, when the first descendant queue of the + * entity remains with no request waiting for completion, + * num_groups_with_pending_reqs is decremented, and this flag + * is reset. After this flag is reset for the entity, + * num_groups_with_pending_reqs won't be decremented any + * longer in case a new descendant queue of the entity remains + * with no request waiting for completion. */ - unsigned int num_active_groups; + unsigned int num_groups_with_pending_reqs; /* * Number of bfq_queues containing requests (including the diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4b0d5fb6916005571d4d4b9885e5a24d194e7a7d..63e0f12be7c98fe7770eb9f1e817f5319690c392 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity, container_of(entity, struct bfq_group, entity); struct bfq_data *bfqd = bfqg->bfqd; - bfqd->num_active_groups++; + if (!entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = true; + bfqd->num_groups_with_pending_reqs++; + } } #endif diff --git a/block/blk-mq.c b/block/blk-mq.c index 3262d83b9e0744a15c0434c4f1502ccb93f2dd73..6a7566244de30bb72b6d779c38c9f0b8d65fd616 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1715,15 +1715,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: - /* - * If direct dispatch fails, we cannot allow any merging on - * this IO. Drivers (like SCSI) may have set up permanent state - * for this request, like SG tables and mappings, and if we - * merge to it later on then we'll still only do IO to the - * original part. - */ - rq->cmd_flags |= REQ_NOMERGE; - blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; @@ -1736,18 +1727,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -/* - * Don't allow direct dispatch of anything but regular reads/writes, - * as some of the other commands can potentially share request space - * with data we need for the IO scheduler. If we attempt a direct dispatch - * on those and fail, we can't safely add it to the scheduler afterwards - * without potentially overwriting data that the driver has already written. - */ -static bool blk_rq_can_direct_dispatch(struct request *rq) -{ - return req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE; -} - static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, @@ -1769,7 +1748,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - if (!blk_rq_can_direct_dispatch(rq) || (q->elevator && !bypass_insert)) + if (q->elevator && !bypass_insert) goto insert; if (!blk_mq_get_dispatch_budget(hctx)) @@ -1785,7 +1764,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, if (bypass_insert) return BLK_STS_RESOURCE; - blk_mq_sched_insert_request(rq, false, run_queue, false); + blk_mq_request_bypass_insert(rq, run_queue); return BLK_STS_OK; } @@ -1801,7 +1780,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_sched_insert_request(rq, false, true, false); + blk_mq_request_bypass_insert(rq, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); @@ -1831,15 +1810,13 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct request *rq = list_first_entry(list, struct request, queuelist); - if (!blk_rq_can_direct_dispatch(rq)) - break; - list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq); if (ret != BLK_STS_OK) { if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - list_add(&rq->queuelist, list); + blk_mq_request_bypass_insert(rq, + list_empty(list)); break; } blk_mq_end_request(rq, ret); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3cf1b773158ece0ca99bcfe92f1c37e22afee25b..962012135b62acf7e956df6fc5780780c3f8a9c4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -831,6 +831,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) { struct nvme_ctrl *ctrl = rq->end_io_data; + unsigned long flags; + bool startka = false; blk_mq_free_request(rq); @@ -841,7 +843,13 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) return; } - schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); + spin_lock_irqsave(&ctrl->lock, flags); + if (ctrl->state == NVME_CTRL_LIVE || + ctrl->state == NVME_CTRL_CONNECTING) + startka = true; + spin_unlock_irqrestore(&ctrl->lock, flags); + if (startka) + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } static int nvme_keep_alive(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3f7971d3706d90d5fbf382072ed8d2da2ac6e8b6..583086dd9cb9a07252a46e6589ff9cc76bf99b33 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -529,6 +529,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) { struct nvmet_rdma_rsp *rsp = container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; nvmet_rdma_release_rsp(rsp); @@ -536,7 +537,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) wc->status != IB_WC_WR_FLUSH_ERR)) { pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); - nvmet_rdma_error_comp(rsp->queue); + nvmet_rdma_error_comp(queue); } }