Commit d29ec824 authored by Christoph Hellwig, committed by Jens Axboe

nvme: submit internal commands through the block layer

Use block layer queues with an internal cmd_type to submit internally
generated NVMe commands.  This both simplifies the code a lot and allows
for a better structure.  For example, the LightNVM code can now construct
commands without knowing the details of the underlying I/O descriptors,
a future NVMe over network target could inject commands, and the SCSI
translation and ioctl code could be reused for such a beast as well.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
Parent 772ce435
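For orientation before reading the diff: a condensed sketch of the submission path this patch introduces, boiled down from __nvme_submit_sync_cmd below. The function name nvme_sync_cmd_sketch is invented for illustration, and error handling plus the user-space mapping branch are left out; the complete version is in the diff.

/* Hypothetical condensation of the new path: the NVMe command rides on a
 * block layer request of type REQ_TYPE_DRV_PRIV instead of being written
 * into the submission queue by hand. */
static int nvme_sync_cmd_sketch(struct request_queue *q,
		struct nvme_command *cmd, void *buffer, unsigned bufflen)
{
	struct request *req;
	int ret;

	req = blk_mq_alloc_request(q, cmd->common.opcode & 1, GFP_KERNEL, false);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->cmd_type = REQ_TYPE_DRV_PRIV;	/* dispatched to nvme_submit_priv() */
	req->cmd = (unsigned char *)cmd;
	req->cmd_len = sizeof(struct nvme_command);
	req->timeout = ADMIN_TIMEOUT;

	if (buffer && bufflen) {
		/* data, if any, is attached as an ordinary kernel mapping */
		ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
		if (ret)
			goto out;
	}

	blk_execute_rq(req->q, NULL, req, 0);	/* blocks until req_completion() */
	ret = req->errors;			/* NVMe status from the CQE */
out:
	blk_mq_free_request(req);
	return ret;
}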
@@ -445,7 +445,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
 				(unsigned long) rq, gfp);
 }
 
-void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
 {
 	const int last_prp = dev->page_size / 8 - 1;
 	int i;
@@ -605,7 +605,12 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 			spin_unlock_irqrestore(req->q->queue_lock, flags);
 			return;
 		}
-		req->errors = nvme_error_status(status);
+		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
+			req->sense_len = le32_to_cpup(&cqe->result);
+			req->errors = status;
+		} else {
+			req->errors = nvme_error_status(status);
+		}
 	} else
 		req->errors = 0;
@@ -630,8 +635,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx,
 }
 
 /* length is in bytes. gfp flags indicates whether we may sleep. */
-int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
-								gfp_t gfp)
+static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
+		int total_len, gfp_t gfp)
 {
 	struct dma_pool *pool;
 	int length = total_len;
@@ -709,6 +714,23 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
 	return total_len;
 }
 
+static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req,
+		struct nvme_iod *iod)
+{
+	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+	memcpy(cmnd, req->cmd, sizeof(struct nvme_command));
+	cmnd->rw.command_id = req->tag;
+	if (req->nr_phys_segments) {
+		cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+		cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
+	}
+
+	if (++nvmeq->sq_tail == nvmeq->q_depth)
+		nvmeq->sq_tail = 0;
+	writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
 /*
  * We reuse the small pool to allocate the 16-byte range here as it is not
  * worth having a special pool for these or additional cases to handle freeing
@@ -807,11 +829,15 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
 	return 0;
 }
 
+/*
+ * NOTE: ns is NULL when called on the admin queue.
+ */
 static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			 const struct blk_mq_queue_data *bd)
 {
 	struct nvme_ns *ns = hctx->queue->queuedata;
 	struct nvme_queue *nvmeq = hctx->driver_data;
+	struct nvme_dev *dev = nvmeq->dev;
 	struct request *req = bd->rq;
 	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
 	struct nvme_iod *iod;
@@ -822,7 +848,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	 * unless this namespace is formated such that the metadata can be
 	 * stripped/generated by the controller with PRACT=1.
 	 */
-	if (ns->ms && !blk_integrity_rq(req)) {
+	if (ns && ns->ms && !blk_integrity_rq(req)) {
 		if (!(ns->pi_type && ns->ms == 8)) {
 			req->errors = -EFAULT;
 			blk_mq_complete_request(req);
@@ -830,7 +856,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
+	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
 	if (!iod)
 		return BLK_MQ_RQ_QUEUE_BUSY;
@@ -841,8 +867,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 		 * as it is not worth having a special pool for these or
 		 * additional cases to handle freeing the iod.
 		 */
-		range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
-						GFP_ATOMIC,
+		range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC,
 						&iod->first_dma);
 		if (!range)
 			goto retry_cmd;
@@ -860,9 +885,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 			goto retry_cmd;
 
 		if (blk_rq_bytes(req) !=
-		    nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
-			dma_unmap_sg(nvmeq->dev->dev, iod->sg,
-					iod->nents, dma_dir);
+		    nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
+			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
 			goto retry_cmd;
 		}
 		if (blk_integrity_rq(req)) {
@@ -884,7 +908,9 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	nvme_set_info(cmd, iod, req_completion);
 	spin_lock_irq(&nvmeq->q_lock);
-	if (req->cmd_flags & REQ_DISCARD)
+	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
+		nvme_submit_priv(nvmeq, req, iod);
+	else if (req->cmd_flags & REQ_DISCARD)
 		nvme_submit_discard(nvmeq, ns, req, iod);
 	else if (req->cmd_flags & REQ_FLUSH)
 		nvme_submit_flush(nvmeq, ns, req->tag);
@@ -896,10 +922,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 
  error_cmd:
-	nvme_free_iod(nvmeq->dev, iod);
+	nvme_free_iod(dev, iod);
 	return BLK_MQ_RQ_QUEUE_ERROR;
  retry_cmd:
-	nvme_free_iod(nvmeq->dev, iod);
+	nvme_free_iod(dev, iod);
 	return BLK_MQ_RQ_QUEUE_BUSY;
 }
@@ -942,15 +968,6 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
 	return 1;
 }
 
-/* Admin queue isn't initialized as a request queue. If at some point this
- * happens anyway, make sure to notify the user */
-static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx,
-			       const struct blk_mq_queue_data *bd)
-{
-	WARN_ON_ONCE(1);
-	return BLK_MQ_RQ_QUEUE_ERROR;
-}
-
 static irqreturn_t nvme_irq(int irq, void *data)
 {
 	irqreturn_t result;
@@ -972,59 +989,61 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
 	return IRQ_WAKE_THREAD;
 }
 
-struct sync_cmd_info {
-	struct task_struct *task;
-	u32 result;
-	int status;
-};
-
-static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
-						struct nvme_completion *cqe)
-{
-	struct sync_cmd_info *cmdinfo = ctx;
-
-	cmdinfo->result = le32_to_cpup(&cqe->result);
-	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
-	wake_up_process(cmdinfo->task);
-}
-
 /*
  * Returns 0 on success.  If the result is negative, it's a Linux error code;
  * if the result is positive, it's an NVM Express status code
  */
-static int __nvme_submit_sync_cmd(struct request_queue *q,
-		struct nvme_command *cmd, u32 *result, unsigned timeout)
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, void __user *ubuffer, unsigned bufflen,
+		u32 *result, unsigned timeout)
 {
-	struct sync_cmd_info cmdinfo;
-	struct nvme_cmd_info *cmd_rq;
+	bool write = cmd->common.opcode & 1;
+	struct bio *bio = NULL;
 	struct request *req;
-	int res;
+	int ret;
 
-	req = blk_mq_alloc_request(q, WRITE, GFP_KERNEL, false);
+	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	cmdinfo.task = current;
-	cmdinfo.status = -EINTR;
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
+	req->__data_len = 0;
+	req->__sector = (sector_t) -1;
+	req->bio = req->biotail = NULL;
 
-	cmd->common.command_id = req->tag;
+	req->timeout = ADMIN_TIMEOUT;
 
-	cmd_rq = blk_mq_rq_to_pdu(req);
-	nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
+	req->cmd = (unsigned char *)cmd;
+	req->cmd_len = sizeof(struct nvme_command);
+	req->sense = NULL;
+	req->sense_len = 0;
 
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	nvme_submit_cmd(cmd_rq->nvmeq, cmd);
-	schedule();
+	if (buffer && bufflen) {
+		ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+	} else if (ubuffer && bufflen) {
+		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT);
+		if (ret)
+			goto out;
+		bio = req->bio;
+	}
 
+	blk_execute_rq(req->q, NULL, req, 0);
+	if (bio)
+		blk_rq_unmap_user(bio);
 	if (result)
-		*result = cmdinfo.result;
-	res = cmdinfo.status;
+		*result = req->sense_len;
+	ret = req->errors;
+ out:
 	blk_mq_free_request(req);
-	return res;
+	return ret;
 }
 
-int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd)
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, unsigned bufflen)
 {
-	return __nvme_submit_sync_cmd(q, cmd, NULL, 0);
+	return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0);
 }
 
 static int nvme_submit_async_admin_req(struct nvme_dev *dev)
@@ -1081,7 +1100,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 	c.delete_queue.opcode = opcode;
 	c.delete_queue.qid = cpu_to_le16(id);
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
@@ -1090,6 +1109,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
 
+	/*
+	 * Note: we (ab)use the fact the the prp fields survive if no data
+	 * is attached to the request.
+	 */
 	memset(&c, 0, sizeof(c));
 	c.create_cq.opcode = nvme_admin_create_cq;
 	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
@@ -1098,7 +1121,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
 	c.create_cq.cq_flags = cpu_to_le16(flags);
 	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
@@ -1107,6 +1130,10 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	struct nvme_command c;
 	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
 
+	/*
+	 * Note: we (ab)use the fact the the prp fields survive if no data
+	 * is attached to the request.
+	 */
 	memset(&c, 0, sizeof(c));
 	c.create_sq.opcode = nvme_admin_create_sq;
 	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
@@ -1115,7 +1142,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
 	c.create_sq.sq_flags = cpu_to_le16(flags);
 	c.create_sq.cqid = cpu_to_le16(qid);
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0);
 }
 
 static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
@@ -1128,18 +1155,43 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
 	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
 }
 
-int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
-							dma_addr_t dma_addr)
+int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id)
 {
-	struct nvme_command c;
+	struct nvme_command c = {
+		.identify.opcode = nvme_admin_identify,
+		.identify.cns = cpu_to_le32(1),
+	};
+	int error;
 
-	memset(&c, 0, sizeof(c));
-	c.identify.opcode = nvme_admin_identify;
-	c.identify.nsid = cpu_to_le32(nsid);
-	c.identify.prp1 = cpu_to_le64(dma_addr);
-	c.identify.cns = cpu_to_le32(cns);
+	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
 
-	return nvme_submit_sync_cmd(dev->admin_q, &c);
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ctrl));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+		struct nvme_id_ns **id)
+{
+	struct nvme_command c = {
+		.identify.opcode = nvme_admin_identify,
+		.identify.nsid = cpu_to_le32(nsid),
+	};
+	int error;
+
+	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ns));
+	if (error)
+		kfree(*id);
+	return error;
 }
 
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
@@ -1153,7 +1205,8 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 	c.features.prp1 = cpu_to_le64(dma_addr);
 	c.features.fid = cpu_to_le32(fid);
 
-	return __nvme_submit_sync_cmd(dev->admin_q, &c, result, 0);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+			result, 0);
 }
 
 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
@@ -1167,7 +1220,30 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
 	c.features.fid = cpu_to_le32(fid);
 	c.features.dword11 = cpu_to_le32(dword11);
 
-	return __nvme_submit_sync_cmd(dev->admin_q, &c, result, 0);
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0,
+			result, 0);
+}
+
+int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log)
+{
+	struct nvme_command c = {
+		.common.opcode = nvme_admin_get_log_page,
+		.common.nsid = cpu_to_le32(0xFFFFFFFF),
+		.common.cdw10[0] = cpu_to_le32(
+			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
+			NVME_LOG_SMART),
+	};
+	int error;
+
+	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
+	if (!*log)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
+			sizeof(struct nvme_smart_log));
+	if (error)
+		kfree(*log);
+	return error;
 }
 
 /**
@@ -1523,7 +1599,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev)
 }
 
 static struct blk_mq_ops nvme_mq_admin_ops = {
-	.queue_rq	= nvme_admin_queue_rq,
+	.queue_rq	= nvme_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_admin_init_hctx,
 	.exit_hctx	= nvme_exit_hctx,
@@ -1644,122 +1720,41 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	return result;
 }
 
-struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
-				unsigned long addr, unsigned length)
-{
-	int i, err, count, nents, offset;
-	struct scatterlist *sg;
-	struct page **pages;
-	struct nvme_iod *iod;
-
-	if (addr & 3)
-		return ERR_PTR(-EINVAL);
-	if (!length || length > INT_MAX - PAGE_SIZE)
-		return ERR_PTR(-EINVAL);
-
-	offset = offset_in_page(addr);
-	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
-	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
-	if (!pages)
-		return ERR_PTR(-ENOMEM);
-
-	err = get_user_pages_fast(addr, count, 1, pages);
-	if (err < count) {
-		count = err;
-		err = -EFAULT;
-		goto put_pages;
-	}
-
-	err = -ENOMEM;
-	iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
-	if (!iod)
-		goto put_pages;
-
-	sg = iod->sg;
-	sg_init_table(sg, count);
-	for (i = 0; i < count; i++) {
-		sg_set_page(&sg[i], pages[i],
-			    min_t(unsigned, length, PAGE_SIZE - offset),
-			    offset);
-		length -= (PAGE_SIZE - offset);
-		offset = 0;
-	}
-	sg_mark_end(&sg[i - 1]);
-	iod->nents = count;
-
-	nents = dma_map_sg(dev->dev, sg, count,
-				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-	if (!nents)
-		goto free_iod;
-
-	kfree(pages);
-	return iod;
-
- free_iod:
-	kfree(iod);
- put_pages:
-	for (i = 0; i < count; i++)
-		put_page(pages[i]);
-	kfree(pages);
-	return ERR_PTR(err);
-}
-
-void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
-			struct nvme_iod *iod)
-{
-	int i;
-
-	dma_unmap_sg(dev->dev, iod->sg, iod->nents,
-			write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
-
-	for (i = 0; i < iod->nents; i++)
-		put_page(sg_page(&iod->sg[i]));
-}
-
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 {
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_user_io io;
 	struct nvme_command c;
-	unsigned length, meta_len, prp_len;
+	unsigned length, meta_len;
 	int status, write;
-	struct nvme_iod *iod;
 	dma_addr_t meta_dma = 0;
 	void *meta = NULL;
 
 	if (copy_from_user(&io, uio, sizeof(io)))
 		return -EFAULT;
-	length = (io.nblocks + 1) << ns->lba_shift;
-	meta_len = (io.nblocks + 1) * ns->ms;
-
-	if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext)
-		return -EINVAL;
-	else if (meta_len && ns->ext) {
-		length += meta_len;
-		meta_len = 0;
-	}
-
-	write = io.opcode & 1;
 
 	switch (io.opcode) {
 	case nvme_cmd_write:
 	case nvme_cmd_read:
 	case nvme_cmd_compare:
-		iod = nvme_map_user_pages(dev, write, io.addr, length);
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	if (IS_ERR(iod))
-		return PTR_ERR(iod);
+	length = (io.nblocks + 1) << ns->lba_shift;
+	meta_len = (io.nblocks + 1) * ns->ms;
+	write = io.opcode & 1;
 
-	prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
-	if (length != prp_len) {
-		status = -ENOMEM;
-		goto unmap;
-	}
 	if (meta_len) {
+		if (((io.metadata & 3) || !io.metadata) && !ns->ext)
+			return -EINVAL;
+
+		if (ns->ext) {
+			length += meta_len;
+			meta_len = 0;
+		}
+
 		meta = dma_alloc_coherent(dev->dev, meta_len,
 						&meta_dma, GFP_KERNEL);
 		if (!meta) {
@@ -1786,13 +1781,11 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
 	c.rw.reftag = cpu_to_le32(io.reftag);
 	c.rw.apptag = cpu_to_le16(io.apptag);
 	c.rw.appmask = cpu_to_le16(io.appmask);
-	c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-	c.rw.prp2 = cpu_to_le64(iod->first_dma);
 	c.rw.metadata = cpu_to_le64(meta_dma);
-	status = nvme_submit_sync_cmd(ns->queue, &c);
+
+	status = __nvme_submit_sync_cmd(ns->queue, &c, NULL,
+			(void __user *)io.addr, length, NULL, 0);
  unmap:
-	nvme_unmap_user_pages(dev, write, iod);
-	nvme_free_iod(dev, iod);
 	if (meta) {
 		if (status == NVME_SC_SUCCESS && !write) {
 			if (copy_to_user((void __user *)io.metadata, meta,
@@ -1809,9 +1802,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 {
 	struct nvme_passthru_cmd cmd;
 	struct nvme_command c;
-	int status, length;
-	struct nvme_iod *uninitialized_var(iod);
-	unsigned timeout;
+	unsigned timeout = 0;
+	int status;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -1831,38 +1823,17 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
 	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
 	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
 
-	length = cmd.data_len;
-	if (cmd.data_len) {
-		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
-								length);
-		if (IS_ERR(iod))
-			return PTR_ERR(iod);
-		length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
-		c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
-		c.common.prp2 = cpu_to_le64(iod->first_dma);
-	}
-
-	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
-								ADMIN_TIMEOUT;
-
-	if (length != cmd.data_len) {
-		status = -ENOMEM;
-		goto out;
-	}
+	if (cmd.timeout_ms)
+		timeout = msecs_to_jiffies(cmd.timeout_ms);
 
 	status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c,
-			&cmd.result, timeout);
+			NULL, (void __user *)cmd.addr, cmd.data_len,
+			&cmd.result, timeout);
 
-out:
-	if (cmd.data_len) {
-		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
-		nvme_free_iod(dev, iod);
+	if (status >= 0) {
+		if (put_user(cmd.result, &ucmd->result))
+			return -EFAULT;
 	}
 
-	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
-							sizeof(cmd.result)))
-		status = -EFAULT;
-
 	return status;
 }
@@ -1954,22 +1925,14 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	struct nvme_ns *ns = disk->private_data;
 	struct nvme_dev *dev = ns->dev;
 	struct nvme_id_ns *id;
-	dma_addr_t dma_addr;
 	u8 lbaf, pi_type;
 	u16 old_ms;
 	unsigned short bs;
 
-	id = dma_alloc_coherent(dev->dev, 4096, &dma_addr, GFP_KERNEL);
-	if (!id) {
-		dev_warn(dev->dev, "%s: Memory alocation failure\n", __func__);
+	if (nvme_identify_ns(dev, ns->ns_id, &id)) {
+		dev_warn(dev->dev, "%s: Identify failure\n", __func__);
 		return 0;
 	}
-	if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
-		dev_warn(dev->dev,
-			"identify failed ns:%d, setting capacity to 0\n",
-								ns->ns_id);
-		memset(id, 0, sizeof(*id));
-	}
 
 	old_ms = ns->ms;
 	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
@@ -2010,7 +1973,7 @@ static int nvme_revalidate_disk(struct gendisk *disk)
 	if (dev->oncs & NVME_CTRL_ONCS_DSM)
 		nvme_config_discard(ns);
 
-	dma_free_coherent(dev->dev, 4096, id, dma_addr);
+	kfree(id);
 	return 0;
 }
@@ -2250,22 +2213,14 @@ static int nvme_dev_add(struct nvme_dev *dev)
 	int res;
 	unsigned nn, i;
 	struct nvme_id_ctrl *ctrl;
-	void *mem;
-	dma_addr_t dma_addr;
 	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
 
-	mem = dma_alloc_coherent(dev->dev, 4096, &dma_addr, GFP_KERNEL);
-	if (!mem)
-		return -ENOMEM;
-
-	res = nvme_identify(dev, 0, 1, dma_addr);
+	res = nvme_identify_ctrl(dev, &ctrl);
 	if (res) {
 		dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
-		dma_free_coherent(dev->dev, 4096, mem, dma_addr);
 		return -EIO;
 	}
 
-	ctrl = mem;
 	nn = le32_to_cpup(&ctrl->nn);
 	dev->oncs = le16_to_cpup(&ctrl->oncs);
 	dev->abort_limit = ctrl->acl + 1;
@@ -2287,7 +2242,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
 		} else
 			dev->max_hw_sectors = max_hw_sectors;
 	}
-	dma_free_coherent(dev->dev, 4096, mem, dma_addr);
+	kfree(ctrl);
 
 	dev->tagset.ops = &nvme_mq_ops;
 	dev->tagset.nr_hw_queues = dev->online_queues - 1;
This diff is collapsed.
@@ -146,21 +146,15 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector)
 	return (sector >> (ns->lba_shift - 9));
 }
 
-/**
- * nvme_free_iod - frees an nvme_iod
- * @dev: The device that the I/O was submitted to
- * @iod: The memory to free
- */
-void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
-
-int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t);
-struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
-				unsigned long addr, unsigned length);
-void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
-			struct nvme_iod *iod);
-int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd);
-int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns,
-							dma_addr_t dma_addr);
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buf, unsigned bufflen);
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, void __user *ubuffer, unsigned bufflen,
+		u32 *result, unsigned timeout);
+int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id);
+int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid,
+		struct nvme_id_ns **id);
+int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log);
 int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
 		dma_addr_t dma_addr, u32 *result);
 int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
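As a usage note, the identify helpers declared above now return a plain kmalloc'ed buffer that the caller must kfree(), exactly as nvme_dev_add() and nvme_revalidate_disk() do in the diff. A small hypothetical caller (the function name and the printed message are made up for this sketch):

/* Hypothetical caller: nvme_identify_ctrl() allocates *ctrl and sends the
 * Identify command through the admin queue; on success the buffer is
 * owned by the caller. */
static int nvme_print_model(struct nvme_dev *dev)
{
	struct nvme_id_ctrl *ctrl;
	int error;

	error = nvme_identify_ctrl(dev, &ctrl);
	if (error)
		return error;		/* negative errno or NVMe status */

	dev_info(dev->dev, "controller model: %.40s\n", ctrl->mn);
	kfree(ctrl);			/* caller frees the identify data */
	return 0;
}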