diff --git a/block/blk-core.c b/block/blk-core.c
index 3b5ee3f7cc1e7c3ac9b65ce251f982399a25f5ec..e25a1c3f8b760d774d316d581d5cb4a7a087469c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1632,6 +1632,31 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
 
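+/**
+ * blk_start_plug_nr_ios - initialize blk_plug for a known batch of I/Os
+ * @plug:	The &struct blk_plug that needs to be initialized
+ * @nr_ios:	number of I/Os the caller expects to submit while plugged;
+ *		used to size batched tag/request allocation, capped at
+ *		BLK_MAX_REQUEST_COUNT
+ */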
+void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
+{
+	struct task_struct *tsk = current;
+
+	/*
+	 * If this is a nested plug, don't actually assign it.
+	 */
+	if (tsk->plug)
+		return;
+
+	INIT_LIST_HEAD(&plug->mq_list);
+	plug->cached_rq = NULL;
+	plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
+	plug->rq_count = 0;
+	plug->multiple_queues = false;
+	plug->nowait = false;
+	INIT_LIST_HEAD(&plug->cb_list);
+
+	/*
+	 * Store ordering should not be needed here, since a potential
+	 * preempt will imply a full memory barrier
+	 */
+	tsk->plug = plug;
+}
+
 /**
  * blk_start_plug - initialize blk_plug and track it inside the task_struct
  * @plug:	The &struct blk_plug that needs to be initialized
@@ -1657,25 +1682,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
  */
 void blk_start_plug(struct blk_plug *plug)
 {
-	struct task_struct *tsk = current;
-
-	/*
-	 * If this is a nested plug, don't actually assign it.
-	 */
-	if (tsk->plug)
-		return;
-
-	INIT_LIST_HEAD(&plug->mq_list);
-	INIT_LIST_HEAD(&plug->cb_list);
-	plug->rq_count = 0;
-	plug->multiple_queues = false;
-	plug->nowait = false;
-
-	/*
-	 * Store ordering should not be needed here, since a potential
-	 * preempt will imply a full memory barrier
-	 */
-	tsk->plug = plug;
+	blk_start_plug_nr_ios(plug, 1);
 }
 EXPORT_SYMBOL(blk_start_plug);
 
@@ -1727,6 +1734,8 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 
 	if (!list_empty(&plug->mq_list))
 		blk_mq_flush_plug_list(plug, from_schedule);
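+	/*
+	 * A flush from the scheduler leaves the plug active, so keep the
+	 * cached requests in that case; unused pre-allocated requests are
+	 * only dropped when the plug is actually being finished.
+	 */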
+	if (unlikely(!from_schedule && plug->cached_rq))
+		blk_mq_free_plug_rqs(plug);
 }
 
 /**
diff --git a/block/blk-mq.c b/block/blk-mq.c
index d861a969b2e0aa59ac5e248103526b7077395abb..d9f14d3c2b8cf0080045b13d7436c6b507f1a00f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -359,6 +359,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 	struct request_queue *q = data->q;
 	struct elevator_queue *e = q->elevator;
 	u64 alloc_time_ns = 0;
+	struct request *rq;
 	unsigned int tag;
 
 	/* alloc_time includes depth and tag waits */
@@ -392,10 +393,21 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 	 * case just retry the hctx assignment and tag allocation as CPU hotplug
 	 * should have migrated us to an online CPU by now.
 	 */
-	tag = blk_mq_get_tag(data);
-	if (tag == BLK_MQ_NO_TAG) {
+	do {
+		tag = blk_mq_get_tag(data);
+		if (tag != BLK_MQ_NO_TAG) {
+			rq = blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
+			if (!--data->nr_tags)
+				return rq;
+			if (e || data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+				return rq;
+			rq->rq_next = *data->cached_rq;
+			*data->cached_rq = rq;
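+			/*
+			 * Only the first allocation may block waiting for a
+			 * tag; the rest of the batch is best-effort, so do
+			 * the remaining allocations in non-blocking mode.
+			 */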
+			data->flags |= BLK_MQ_REQ_NOWAIT;
+			continue;
+		}
 		if (data->flags & BLK_MQ_REQ_NOWAIT)
-			return NULL;
+			break;
 
 		/*
 		 * Give up the CPU and sleep for a random short time to ensure
@@ -404,8 +416,15 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 		 */
 		msleep(3);
 		goto retry;
+	} while (1);
+
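+	/*
+	 * Tag allocation failed; if earlier iterations managed to cache
+	 * requests for this batch, hand one of those out instead.
+	 */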
+	if (data->cached_rq && *data->cached_rq) {
+		rq = *data->cached_rq;
+		*data->cached_rq = rq->rq_next;
+		return rq;
 	}
-	return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
+
+	return NULL;
 }
 
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
@@ -415,6 +434,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		.q		= q,
 		.flags		= flags,
 		.cmd_flags	= op,
+		.nr_tags	= 1,
 	};
 	struct request *rq;
 	int ret;
@@ -443,6 +463,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		.q		= q,
 		.flags		= flags,
 		.cmd_flags	= op,
+		.nr_tags	= 1,
 	};
 	u64 alloc_time_ns = 0;
 	unsigned int cpu;
@@ -544,6 +565,18 @@ void blk_mq_free_request(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
+void blk_mq_free_plug_rqs(struct blk_plug *plug)
+{
+	while (plug->cached_rq) {
+		struct request *rq;
+
+		rq = plug->cached_rq;
+		plug->cached_rq = rq->rq_next;
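+		/*
+		 * blk_mq_free_request() drops a queue usage reference that
+		 * these never-started cached requests do not own, so take
+		 * an extra reference first to keep q_usage_counter balanced.
+		 */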
+		percpu_ref_get(&rq->q->q_usage_counter);
+		blk_mq_free_request(rq);
+	}
+}
+
 inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 {
 	u64 now = 0;
@@ -2185,6 +2218,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
 	const int is_flush_fua = op_is_flush(bio->bi_opf);
 	struct blk_mq_alloc_data data = {
 		.q		= q,
+		.nr_tags	= 1,
 	};
 	struct request *rq;
 	struct blk_plug *plug;
@@ -2211,13 +2245,26 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
 
 	hipri = bio->bi_opf & REQ_HIPRI;
 
-	data.cmd_flags = bio->bi_opf;
-	rq = __blk_mq_alloc_request(&data);
-	if (unlikely(!rq)) {
-		rq_qos_cleanup(q, bio);
-		if (bio->bi_opf & REQ_NOWAIT)
-			bio_wouldblock_error(bio);
-		goto queue_exit;
+	plug = blk_mq_plug(q, bio);
+	if (plug && plug->cached_rq) {
+		rq = plug->cached_rq;
+		plug->cached_rq = rq->rq_next;
+		INIT_LIST_HEAD(&rq->queuelist);
+		data.hctx = rq->mq_hctx;
+	} else {
+		data.cmd_flags = bio->bi_opf;
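+		/*
+		 * No cached request is available. If a plug is active, size
+		 * this allocation to the whole batch so that any leftover
+		 * requests get parked in the plug for later submissions.
+		 */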
+		if (plug) {
+			data.nr_tags = plug->nr_ios;
+			plug->nr_ios = 1;
+			data.cached_rq = &plug->cached_rq;
+		}
+		rq = __blk_mq_alloc_request(&data);
+		if (unlikely(!rq)) {
+			rq_qos_cleanup(q, bio);
+			if (bio->bi_opf & REQ_NOWAIT)
+				bio_wouldblock_error(bio);
+			goto queue_exit;
+		}
 	}
 
 	trace_block_getrq(bio);
@@ -2236,7 +2283,6 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
 		return BLK_QC_T_NONE;
 	}
 
-	plug = blk_mq_plug(q, bio);
 	if (unlikely(is_flush_fua)) {
 		/* Bypass scheduler for flush requests */
 		blk_insert_flush(rq);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 171e8cdcff5437f22ebe783bbc8b964a8b7f66d8..5da970bb886591c79ca7dd0398f192e32fb6b26a 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -125,6 +125,7 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
 extern int blk_mq_sysfs_register(struct request_queue *q);
 extern void blk_mq_sysfs_unregister(struct request_queue *q);
 extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
+void blk_mq_free_plug_rqs(struct blk_plug *plug);
 
 void blk_mq_release(struct request_queue *q);
 
@@ -152,6 +153,10 @@ struct blk_mq_alloc_data {
 	unsigned int shallow_depth;
 	unsigned int cmd_flags;
 
+	/* allocate multiple requests/tags in one go */
+	unsigned int nr_tags;
+	struct request **cached_rq;
+
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 75d75657df2123352369597d35f4bbcdccfc7fa7..0e941f2175784d1b474b44702b11284c33ddf1d7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -90,7 +90,10 @@ struct request {
 	struct bio *bio;
 	struct bio *biotail;
 
-	struct list_head queuelist;
+	union {
+		struct list_head queuelist;
+		struct request *rq_next;
+	};
 
 	/*
 	 * The hash is used inside the scheduler, and killed once the
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 472b4ab007c6b6d070e0552ed579d31f4359ff6f..17705c970d7e1871f92811c2460e980267a6fbe0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -722,10 +722,17 @@ extern void blk_set_queue_dying(struct request_queue *);
  */
 struct blk_plug {
 	struct list_head mq_list; /* blk-mq requests */
-	struct list_head cb_list; /* md requires an unplug callback */
+
+	/* if nr_ios is > 1, we can batch tag/rq allocations */
+	struct request *cached_rq;
+	unsigned short nr_ios;
+
 	unsigned short rq_count;
+
 	bool multiple_queues;
 	bool nowait;
+
+	struct list_head cb_list; /* md requires an unplug callback */
 };
 
 struct blk_plug_cb;
@@ -738,6 +745,7 @@ struct blk_plug_cb {
 extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug,
 					     void *data, int size);
 extern void blk_start_plug(struct blk_plug *);
+extern void blk_start_plug_nr_ios(struct blk_plug *, unsigned short);
 extern void blk_finish_plug(struct blk_plug *);
 extern void blk_flush_plug_list(struct blk_plug *, bool);
 
@@ -772,6 +780,11 @@ long nr_blockdev_pages(void);
 struct blk_plug {
 };
 
+static inline void blk_start_plug_nr_ios(struct blk_plug *plug,
+					 unsigned short nr_ios)
+{
+}
+
 static inline void blk_start_plug(struct blk_plug *plug)
 {
 }