Commit 506e931f, authored by Jens Axboe

blk-mq: add basic round-robin of what CPU to queue workqueue work on

Right now we just pick the first CPU in the mask, but that can
easily overload that one. Add some basic batching and round-robin
all the entries in the mask instead.
Signed-off-by: Jens Axboe <axboe@fb.com>
Parent 5cf8c227
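For readers of the diff below: the new blk_mq_hctx_next_cpu() hands out the cached next_cpu and only advances to the next entry in the cpumask once every BLK_MQ_CPU_WORK_BATCH (8) dispatches, wrapping back to the first CPU at the end of the mask. A minimal user-space sketch of that batching behavior follows; the fake_hctx struct, the next_cpu() helper, and the plain int array standing in for the cpumask are illustrative stand-ins, not kernel API.

/*
 * Illustrative sketch only: models the batched round-robin outside the
 * kernel. "mask" is a plain array standing in for hctx->cpumask and
 * WORK_BATCH mirrors BLK_MQ_CPU_WORK_BATCH = 8.
 */
#include <stdio.h>

#define WORK_BATCH 8

struct fake_hctx {
	const int *mask;	/* CPUs this hw queue may run on */
	int nr;			/* number of entries in mask */
	int idx;		/* index of the CPU currently handed out */
	int batch;		/* items left before advancing */
};

static int next_cpu(struct fake_hctx *h)
{
	int cpu = h->mask[h->idx];

	/* Hand out the same CPU for WORK_BATCH items, then advance. */
	if (--h->batch <= 0) {
		h->idx = (h->idx + 1) % h->nr;	/* wrap around the mask */
		h->batch = WORK_BATCH;
	}
	return cpu;
}

int main(void)
{
	static const int mask[] = { 0, 2, 4 };
	struct fake_hctx h = { mask, 3, 0, WORK_BATCH };

	/* 24 queued items -> eight on CPU 0, eight on CPU 2, eight on CPU 4. */
	for (int i = 0; i < 24; i++)
		printf("%d ", next_cpu(&h));
	printf("\n");
	return 0;
}

Run as-is, this prints eight 0s, then eight 2s, then eight 4s: each hardware context sticks to one CPU for a batch of work before rotating, which is the behavior the commit introduces.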
block/blk-mq.c
@@ -670,6 +670,30 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	}
 }
 
+/*
+ * It'd be great if the workqueue API had a way to pass
+ * in a mask and had some smarts for more clever placement.
+ * For now we just round-robin here, switching for every
+ * BLK_MQ_CPU_WORK_BATCH queued items.
+ */
+static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
+{
+	int cpu = hctx->next_cpu;
+
+	if (--hctx->next_cpu_batch <= 0) {
+		int next_cpu;
+
+		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+		if (next_cpu >= nr_cpu_ids)
+			next_cpu = cpumask_first(hctx->cpumask);
+
+		hctx->next_cpu = next_cpu;
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+	}
+
+	return cpu;
+}
+
 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
 	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
@@ -682,13 +706,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	else {
 		unsigned int cpu;
 
-		/*
-		 * It'd be great if the workqueue API had a way to pass
-		 * in a mask and had some smarts for more clever placement
-		 * than the first CPU. Or we could round-robin here. For now,
-		 * just queue on the first CPU.
-		 */
-		cpu = cpumask_first(hctx->cpumask);
+		cpu = blk_mq_hctx_next_cpu(hctx);
 		kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
 	}
 }
@@ -795,13 +813,7 @@ void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
 	else {
 		unsigned int cpu;
 
-		/*
-		 * It'd be great if the workqueue API had a way to pass
-		 * in a mask and had some smarts for more clever placement
-		 * than the first CPU. Or we could round-robin here. For now,
-		 * just queue on the first CPU.
-		 */
-		cpu = cpumask_first(hctx->cpumask);
+		cpu = blk_mq_hctx_next_cpu(hctx);
 		kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
 	}
 }
@@ -1378,6 +1390,11 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->next_cpu = cpumask_first(hctx->cpumask);
+		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
+	}
 }
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
...
include/linux/blk-mq.h
@@ -21,6 +21,8 @@ struct blk_mq_hw_ctx {
 	struct delayed_work	run_work;
 	struct delayed_work	delay_work;
 	cpumask_var_t		cpumask;
+	int			next_cpu;
+	int			next_cpu_batch;
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
@@ -126,6 +128,8 @@ enum {
 	BLK_MQ_S_STOPPED	= 0,
 
 	BLK_MQ_MAX_DEPTH	= 2048,
+
+	BLK_MQ_CPU_WORK_BATCH	= 8,
 };
 
 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
...