// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */

#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>

#include "trace.h"
#include "nvme.h"

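/*
 * ->sqes holds the log2 of the submission queue entry size (64 bytes on most
 * controllers, larger on a few quirky ones), so the SQ allocation size below
 * is computed with a shift rather than a multiply.
 */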
#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0444);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
	.get = param_get_uint,
};

static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");

static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
	unsigned int n;
	int ret;

	ret = kstrtouint(val, 10, &n);
	if (ret != 0 || n > num_possible_cpus())
		return -EINVAL;
	return param_set_uint(val, kp);
}

static const struct kernel_param_ops io_queue_count_ops = {
	.set = io_queue_count_set,
	.get = param_get_uint,
};

static unsigned int write_queues;
module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

static unsigned int poll_queues;
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue *queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
	unsigned io_queues[HCTX_MAX_TYPES];
	unsigned int num_vecs;
	u16 q_depth;
	int io_sqes;
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	u64 cmb_size;
	bool cmb_use_sqes;
	u32 cmbsz;
	u32 cmbloc;
	struct nvme_ctrl ctrl;
	u32 last_ps;

	mempool_t *iod_mempool;

	/* shadow doorbell buffer support: */
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int ret;
	u16 n;

	ret = kstrtou16(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_ushort(val, kp);
}

static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct nvme_dev *dev;
	spinlock_t sq_lock;
	void *sq_cmds;
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
	struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 sqes;
	unsigned long flags;
#define NVMEQ_ENABLED		0
#define NVMEQ_SQ_CMB		1
#define NVMEQ_DELETE_ERROR	2
#define NVMEQ_POLLED		3
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
	struct completion delete_done;
};

/*
 * The nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
 * to the actual struct scatterlist.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_queue *nvmeq;
	bool use_sgl;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	dma_addr_t first_dma;
	unsigned int dma_len;	/* length of single DMA segment mapping */
	dma_addr_t meta_dma;
	struct scatterlist *sg;
};

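/*
 * Size of the shadow doorbell / event-index buffers: one 4-byte SQ entry and
 * one 4-byte CQ entry per allocated queue, each scaled by the doorbell stride.
 */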
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
{
	return dev->nr_allocated_queues * 8 * dev->db_stride;
}

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev);

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev);

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
	}
}

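/*
 * In u16 modular arithmetic, returns true when the controller's event index
 * falls within (old, new_idx], i.e. the doorbell update we are about to skip
 * crosses the value the controller asked to be notified about.
 */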
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory.  The controller needs to provide similar
		 * ordering to ensure the event index is updated before reading
		 * the doorbell.
		 */
		mb();

		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
}

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_pci_npages_prp(void)
{
	unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
				      NVME_CTRL_PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
static int nvme_pci_npages_sgl(void)
{
	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
			PAGE_SIZE);
}

static size_t nvme_pci_iod_alloc_size(void)
{
	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());

	return sizeof(__le64 *) * npages +
		sizeof(struct scatterlist) * NVME_MAX_SEGS;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);

	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;

	nvme_req(req)->ctrl = &dev->ctrl;
	return 0;
}

static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}

static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
			BUG_ON(i == HCTX_TYPE_DEFAULT);
			continue;
		}

		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
		map->queue_offset = qoff;
		if (i != HCTX_TYPE_POLL && offset)
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
}

static inline void nvme_write_sq_db(struct nvme_queue *nvmeq)
{
	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 * @write_sq: whether to write to the SQ doorbell
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
{
	spin_lock(&nvmeq->sq_lock);
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
	       cmd, sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	if (write_sq)
		nvme_write_sq_db(nvmeq);
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
	nvme_write_sq_db(nvmeq);
	spin_unlock(&nvmeq->sq_lock);
}

static void **nvme_pci_iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}

static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int nseg = blk_rq_nr_phys_segments(req);
	unsigned int avg_seg_size;

	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

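/*
 * Undo nvme_map_data(): walk the chained PRP or SGL pool pages recorded in
 * the iod, free each one, then return the scatterlist to the mempool.
 */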
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
	int i;

	if (iod->dma_len) {
		dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
			       rq_dma_dir(req));
		return;
	}

	WARN_ON_ONCE(!iod->nents);

	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
				    rq_dma_dir(req));
	else
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));


	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			dma_addr);

	for (i = 0; i < iod->npages; i++) {
		void *addr = nvme_pci_iod_list(req)[i];

		if (iod->use_sgl) {
			struct nvme_sgl_desc *sg_list = addr;

			next_dma_addr =
			    le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
		} else {
			__le64 *prp_list = addr;

			next_dma_addr = le64_to_cpu(prp_list[last_prp]);
		}

		dma_pool_free(dev->prp_page_pool, addr, dma_addr);
		dma_addr = next_dma_addr;
	}

	mempool_free(iod->sg, dev->iod_mempool);
}

static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}

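/*
 * Build the PRP list for a request: the first PRP may start at an offset
 * within a controller page, every later entry must be page aligned, and the
 * last slot of each PRP page chains to the next page of the list.
 */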
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	int length = blk_rq_payload_bytes(req);
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
	__le64 *prp_list;
	void **list = nvme_pci_iod_list(req);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (NVME_CTRL_PAGE_SIZE - offset);
	if (length <= 0) {
		iod->first_dma = 0;
		goto done;
	}

	dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= NVME_CTRL_PAGE_SIZE) {
		iod->first_dma = dma_addr;
		goto done;
	}

	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return BLK_STS_RESOURCE;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= NVME_CTRL_PAGE_SIZE;
		dma_addr += NVME_CTRL_PAGE_SIZE;
		length -= NVME_CTRL_PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		if (unlikely(dma_len < 0))
			goto bad_sgl;
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);

	return BLK_STS_OK;

 bad_sgl:
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
	return BLK_STS_IOERR;
}

static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmd, int entries)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
	int i = 0;

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

	if (entries == 1) {
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				return BLK_STS_RESOURCE;

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
	} while (--entries > 0);

	return BLK_STS_OK;
}

static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
	if (bv->bv_len > first_prp_len)
		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
	return BLK_STS_OK;
}

static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->flags = NVME_CMD_SGL_METABUF;
	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
	return BLK_STS_OK;
}

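/*
 * Map the request's data for DMA.  Single-segment requests use one of the
 * simple setups above (plain PRPs or a single SGL data descriptor); anything
 * larger goes through a scatterlist and the full PRP/SGL list builders.
 */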
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	blk_status_t ret = BLK_STS_RESOURCE;
	int nr_mapped;

	if (blk_rq_nr_phys_segments(req) == 1) {
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);

			if (iod->nvmeq->qid &&
			    dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
				return nvme_setup_sgl_simple(dev, req,
							     &cmnd->rw, &bv);
		}
	}

	iod->dma_len = 0;
	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
	if (!iod->sg)
		return BLK_STS_RESOURCE;
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
	if (!iod->nents)
		goto out;

	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
	if (!nr_mapped)
		goto out;

	iod->use_sgl = nvme_pci_use_sgls(dev, req);
	if (iod->use_sgl)
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
out:
	if (ret != BLK_STS_OK)
		nvme_unmap_data(dev, req);
	return ret;
}

static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
			rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->meta_dma))
		return BLK_STS_IOERR;
	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
	return BLK_STS_OK;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_command cmnd;
	blk_status_t ret;

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;

	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
		return BLK_STS_IOERR;

	ret = nvme_setup_cmd(ns, req, &cmnd);
	if (ret)
		return ret;

	if (blk_rq_nr_phys_segments(req)) {
		ret = nvme_map_data(dev, req, &cmnd);
		if (ret)
			goto out_free_cmd;
	}

	if (blk_integrity_rq(req)) {
		ret = nvme_map_metadata(dev, req, &cmnd);
		if (ret)
			goto out_unmap_data;
	}

	blk_mq_start_request(req);
	nvme_submit_cmd(nvmeq, &cmnd, bd->last);
	return BLK_STS_OK;
out_unmap_data:
	nvme_unmap_data(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}

static void nvme_pci_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dev *dev = iod->nvmeq->dev;

	if (blk_integrity_rq(req))
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
	if (blk_rq_nr_phys_segments(req))
		nvme_unmap_data(dev, req);
	nvme_complete_rq(req);
}

/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];

	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
}

static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return nvmeq->dev->admin_tagset.tags[0];
	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}

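/*
 * Complete one reaped CQ entry: validate the command id, special-case AEN
 * completions (which have no struct request), and hand everything else back
 * to the block layer.
 */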
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvme_is_aen_req(nvmeq->qid, cqe->command_id))) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
	if (!nvme_end_request(req, cqe->status, cqe->result))
		nvme_pci_complete_rq(req);
}

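/* Advance the CQ head, flipping the phase bit each time the head wraps. */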
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
	u16 tmp = nvmeq->cq_head + 1;

	if (tmp == nvmeq->q_depth) {
		nvmeq->cq_head = 0;
		nvmeq->cq_phase ^= 1;
	} else {
		nvmeq->cq_head = tmp;
	}
}

static inline int nvme_process_cq(struct nvme_queue *nvmeq)
{
	int found = 0;

	while (nvme_cqe_pending(nvmeq)) {
		found++;
		/*
		 * load-load control dependency between phase and the rest of
		 * the cqe requires a full read memory barrier
		 */
		dma_rmb();
		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
		nvme_update_cq_head(nvmeq);
	}

	if (found)
		nvme_ring_cq_doorbell(nvmeq);
	return found;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	irqreturn_t ret = IRQ_NONE;

	/*
	 * The rmb/wmb pair ensures we see all updates from a previous run of
	 * the irq handler, even if that was on another CPU.
	 */
	rmb();
	if (nvme_process_cq(nvmeq))
		ret = IRQ_HANDLED;
	wmb();

	return ret;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;

	if (nvme_cqe_pending(nvmeq))
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
}

/*
 * Poll for completions for any interrupt driven queue
 * Can be called from any context.
 */
static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);

	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));

	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	nvme_process_cq(nvmeq);
	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

	spin_lock(&nvmeq->cq_poll_lock);
	found = nvme_process_cq(nvmeq);
	spin_unlock(&nvmeq->cq_poll_lock);

	return found;
}

static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	struct nvme_queue *nvmeq = &dev->queues[0];
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	nvme_submit_cmd(nvmeq, &c, true);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void abort_endio(struct request *req, blk_status_t error)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;

	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
}

static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	default:
		break;
	}

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_command cmd;
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);
		return BLK_EH_DONE;
	}

	/*
	 * Did we miss an interrupt?
	 */
	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
		nvme_poll(req->mq_hctx);
	else
		nvme_poll_irqdisable(nvmeq);

	if (blk_mq_request_completed(req)) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
		return BLK_EH_DONE;
	}

	/*
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
		/* fall through */
	case NVME_CTRL_DELETING:
		dev_warn_ratelimited(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, true);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
	default:
		break;
	}

	/*
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);

		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	}

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	iod->aborted = 1;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->dev->ctrl.device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
			BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->timeout = ADMIN_TIMEOUT;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (!nvmeq->sq_cmds)
		return;

	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
	} else {
		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	}
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
		nvme_free_queue(&dev->queues[i]);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq: queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
		return 1;

	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
	mb();

	nvmeq->dev->online_queues--;
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
	return 0;
}

static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
}

static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *nvmeq = &dev->queues[0];

	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
		nvme_disable_ctrl(&dev->ctrl);

	nvme_poll_irqdisable(nvmeq);
}

/*
 * Called only on a device that has been disabled and after all other threads
 * that can check this device's completion queues have synced, except
 * nvme_poll(). This is the last chance for the driver to see a natural
 * completion before nvme_cancel_request() terminates all incomplete requests.
 */
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
		spin_lock(&dev->queues[i].cq_poll_lock);
		nvme_process_cq(&dev->queues[i]);
		spin_unlock(&dev->queues[i].cq_poll_lock);
	}
}

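/*
 * Shrink the I/O queue depth, if needed, so that all submission queues still
 * fit inside the controller memory buffer when SQEs are placed there.
 */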
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  NVME_CTRL_PAGE_SIZE);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);

		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
		if (nvmeq->sq_cmds) {
			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
							nvmeq->sq_cmds);
			if (nvmeq->sq_dma_addr) {
				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
				return 0;
			}

			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
		}
	}

	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
				&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
	return 0;
}

static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid)
		return 0;

	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
	nvmeq->q_depth = depth;
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
		goto free_cqdma;

	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->sq_lock);
	spin_lock_init(&nvmeq->cq_poll_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->qid = qid;
	dev->ctrl.queue_count++;

	return 0;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
 free_nvmeq:
	return -ENOMEM;
}

static int queue_request_irq(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
	u16 vector = 0;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
		set_bit(NVMEQ_POLLED, &nvmeq->flags);

	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
	if (result)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		return result;
	if (result)
		goto release_cq;

	nvmeq->cq_vector = vector;
	nvme_init_queue(nvmeq, qid);

	if (!polled) {
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;

release_sq:
	dev->online_queues--;
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static const struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.init_hctx	= nvme_admin_init_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
};

static const struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;

		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
		dev->ctrl.admin_tagset = &dev->admin_tagset;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);

	return 0;
}

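/*
 * Doorbell BAR size: the 4KiB register block plus one SQ tail and one CQ head
 * doorbell per queue (admin + I/O), each spaced by the doorbell stride.
 */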
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl);
	if (result < 0)
		return result;

	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
	if (result)
		return result;

	dev->ctrl.numa_node = dev_to_node(dev->dev);

	nvmeq = &dev->queues[0];
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl);
	if (result)
		return result;

	nvmeq->cq_vector = 0;
	nvme_init_queue(nvmeq, 0);
	result = queue_request_irq(nvmeq);
	if (result) {
		dev->online_queues--;
		return result;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;
}

static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
	} else {
		rw_queues = max;
	}

	for (i = dev->online_queues; i <= max; i++) {
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

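/* CMBSZ.SZU encodes the CMB size granularity as 4KiB << (4 * SZU). */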
static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}

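/*
 * Discover the Controller Memory Buffer, clamp it to the size of its
 * backing BAR and register it as a P2P DMA resource so that submission
 * queues can be placed in it.
 */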
static void nvme_map_cmb(struct nvme_dev *dev)
{
	u64 size, offset;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int bar;

	if (dev->cmb_size)
		return;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!dev->cmbsz)
		return;
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);

	if (offset > bar_size)
		return;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
		return;
	}

	dev->cmb_size = size;
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb_size) {
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
		dev->cmb_size = 0;
	}
}

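/*
 * Pass the host memory buffer descriptor list to the controller with a
 * Set Features (Host Memory Buffer) command; 'bits' carries the
 * enable/return flags.
 */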
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
	u64 dma_addr = dev->host_mem_descs_dma;
	struct nvme_command c;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode	= nvme_admin_set_features;
	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	c.features.dword11	= cpu_to_le32(bits);
	c.features.dword12	= cpu_to_le32(host_mem_size);
	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
	if (ret) {
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	}
	return ret;
}

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;

		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
			       le64_to_cpu(desc->addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
	dev->host_mem_descs = NULL;
	dev->nr_host_mem_descs = 0;
}

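/*
 * Allocate up to 'preferred' bytes of host memory in 'chunk_size' pieces
 * and build the descriptor list handed to the controller. Stops early if
 * an allocation fails; only a completely empty buffer is treated as an
 * error.
 */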
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	struct nvme_host_mem_buf_desc *descs;
	u32 max_entries, len;
	dma_addr_t descs_dma;
	int i = 0;
	void **bufs;
	u64 size, tmp;

	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
				   &descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	for (size = 0; size < preferred && i < max_entries; size += len) {
		dma_addr_t dma_addr;

		len = min_t(u64, chunk_size, preferred - size);
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
		i++;
	}

	if (!size)
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--i >= 0) {
		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;

		dma_free_attrs(dev->dev, size, bufs[i],
			       le64_to_cpu(descs[i].addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	u64 chunk_size;

	/* start big and work our way down */
	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}

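/*
 * Set up the Host Memory Buffer requested by the controller, reusing an
 * existing allocation when possible. HMB failures are deliberately not
 * fatal to controller initialization.
 */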
static int nvme_setup_host_mem(struct nvme_dev *dev)
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
	int ret;

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
		return 0;
	}

	/*
	 * If we already have a buffer allocated check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
			return 0; /* controller must work without HMB */
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
	}

	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
		nvme_free_host_mem(dev);
	return ret;
}

/*
 * nirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
 */
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
{
	struct nvme_dev *dev = affd->priv;
	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;

	/*
	 * If there is no interrupt available for queues, ensure that
	 * the default queue is set to 1. The affinity set size is
	 * also set to one, but the irq core ignores it for this case.
	 *
	 * If only one interrupt is available or 'write_queues' == 0, combine
	 * write and read queues.
	 *
	 * If 'write_queues' > 0, ensure it leaves room for at least one read
	 * queue.
	 */
	if (!nrirqs) {
		nrirqs = 1;
		nr_read_queues = 0;
	} else if (nrirqs == 1 || !nr_write_queues) {
		nr_read_queues = 0;
	} else if (nr_write_queues >= nrirqs) {
		nr_read_queues = 1;
	} else {
		nr_read_queues = nrirqs - nr_write_queues;
	}

	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
	affd->nr_sets = nr_read_queues ? 2 : 1;
}

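/*
 * Allocate interrupt vectors for the admin queue and the non-polled I/O
 * queues; nvme_calc_irq_sets() splits them between the default (write)
 * and read queue sets.
 */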
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	struct irq_affinity affd = {
		.pre_vectors	= 1,
		.calc_sets	= nvme_calc_irq_sets,
		.priv		= dev,
	};
	unsigned int irq_queues, this_p_queues;

	/*
	 * Poll queues don't need interrupts, but we need at least one IO
	 * queue left over for non-polled IO.
	 */
	this_p_queues = dev->nr_poll_queues;
	if (this_p_queues >= nr_io_queues) {
		this_p_queues = nr_io_queues - 1;
		irq_queues = 1;
	} else {
		irq_queues = nr_io_queues - this_p_queues + 1;
	}
	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;

	/* Initialize for the single interrupt case */
	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
	dev->io_queues[HCTX_TYPE_READ] = 0;

	/*
	 * Some Apple controllers require all queues to use the
	 * first vector.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR)
		irq_queues = 1;

	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}

static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
}

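/*
 * Negotiate the number of I/O queues with the controller, resize the
 * doorbell mapping and interrupt vectors to match, and create the queues,
 * retrying with fewer queues if creation falls short.
 */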
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = &dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	unsigned int nr_io_queues;
	unsigned long size;
	int result;

	/*
	 * Sample the module parameters once at reset time so that we have
	 * stable values to work with.
	 */
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;

	/*
	 * If tags are shared with admin queue (Apple bug), then
	 * make sure we only use one IO queue.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
		nr_io_queues = 1;
	else
		nr_io_queues = min(nvme_max_io_queues(dev),
				   dev->nr_allocated_queues - 1);

	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	clear_bit(NVMEQ_ENABLED, &adminq->flags);

	if (dev->cmb_use_sqes) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			dev->cmb_use_sqes = false;
	}

	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues)
			return -ENOMEM;
	} while (1);
	adminq->q_db = dev->dbs;

 retry:
	/* Deregister the admin queue's interrupt */
	pci_free_irq(pdev, 0, adminq);

	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
	pci_free_irq_vectors(pdev);

	result = nvme_setup_irqs(dev, nr_io_queues);
	if (result <= 0)
		return -EIO;

	dev->num_vecs = result;
	result = max(result - 1, 1);
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	result = queue_request_irq(adminq);
	if (result)
		return result;
	set_bit(NVMEQ_ENABLED, &adminq->flags);

	result = nvme_create_io_queues(dev);
	if (result || dev->online_queues < 2)
		return result;

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_disable_io_queues(dev);
		nvme_suspend_io_queues(dev);
		goto retry;
	}
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
	return 0;
}

static void nvme_del_queue_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	blk_mq_free_request(req);
	complete(&nvmeq->delete_done);
}

static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	if (error)
		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	nvme_del_queue_end(req, error);
}

static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = ADMIN_TIMEOUT;
	req->end_io_data = nvmeq;

	init_completion(&nvmeq->delete_done);
	blk_execute_rq_nowait(q, NULL, req, false,
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
}

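/*
 * Send Delete SQ/CQ commands for all online I/O queues and wait for their
 * completions; returns false if the controller stops answering.
 */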
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int nr_queues = dev->online_queues - 1, sent = 0;
	unsigned long timeout;

 retry:
	timeout = ADMIN_TIMEOUT;
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
	}
	while (sent) {
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
				timeout);
		if (timeout == 0)
			return false;

		sent--;
		if (nr_queues)
			goto retry;
	}
	return true;
}

static void nvme_dev_add(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.nr_maps = 2; /* default + read */
		if (dev->io_queues[HCTX_TYPE_POLL])
			dev->tagset.nr_maps++;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev->ctrl.numa_node;
		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
						BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = sizeof(struct nvme_iod);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		/*
		 * Some Apple controllers require tags to be unique
		 * across admin and IO queue, so reserve the first 32
		 * tags of the IO queue.
		 */
		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
			dev->tagset.reserved_tags = NVME_AQ_DEPTH;

		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
			return;
		}
		dev->ctrl.tagset = &dev->tagset;
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
	}

	nvme_dbbuf_set(dev);
}

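/*
 * Enable the PCI device, size queues and doorbells from the CAP register,
 * apply device-specific quirks and map the CMB. Only a single interrupt
 * vector is enabled here; the full set is allocated once the queue count
 * is known.
 */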
static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;

	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(u16, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Some Apple controllers require a non-standard SQE size.
	 * Interestingly they also seem to ignore the CC:IOSQES register
	 * so we don't bother updating it here.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
		dev->io_sqes = 7;
	else
		dev->io_sqes = NVME_NVM_IOSQES;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
			"set queue depth=%u\n", dev->q_depth);
	}

	/*
	 * Controllers with the shared tags quirk need the IO queue to be
	 * big enough so that we get 32 tags for the admin queue
	 */
	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
		dev->q_depth = NVME_AQ_DEPTH + 2;
		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
			 dev->q_depth);
	}

	nvme_map_cmb(dev);

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->bar)
		iounmap(dev->bar);
	pci_release_mem_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	pci_free_irq_vectors(pdev);

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
}

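/*
 * Quiesce the controller for reset or shutdown: freeze and stop the
 * queues, delete the I/O queues and disable the admin queue unless the
 * device is already dead, then cancel or flush any outstanding requests.
 */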
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	bool dead = true, freeze = false;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			freeze = true;
			nvme_start_freeze(&dev->ctrl);
		}
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state != pci_channel_io_normal);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead && shutdown && freeze)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);

	nvme_stop_queues(&dev->ctrl);

	if (!dead && dev->ctrl.queue_count > 0) {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
	nvme_pci_disable(dev);
	nvme_reap_pending_cqes(dev);

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_wait_completed_request(&dev->tagset);
	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
	if (shutdown) {
		nvme_start_queues(&dev->ctrl);
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
	}
	mutex_unlock(&dev->shutdown_lock);
}

static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (!nvme_wait_reset(&dev->ctrl))
		return -EBUSY;
	nvme_dev_disable(dev, shutdown);
	return 0;
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_free_tagset(struct nvme_dev *dev)
{
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	dev->ctrl.tagset = NULL;
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	nvme_free_tagset(dev);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	free_opal_dev(dev->ctrl.opal_dev);
	mempool_destroy(dev->iod_mempool);
	put_device(dev->dev);
	kfree(dev->queues);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
	if (!queue_work(nvme_wq, &dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

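/*
 * Controller (re)initialization: re-enable the PCI device and admin
 * queue, refresh the identify data, set up the optional HMB and the I/O
 * queues, and move the controller to the LIVE state; any failure tears
 * the controller down.
 */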
static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result;

	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
		result = -ENODEV;
		goto out;
	}

	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);
	nvme_sync_queues(&dev->ctrl);

	mutex_lock(&dev->shutdown_lock);
	result = nvme_pci_enable(dev);
	if (result)
		goto out_unlock;

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto out_unlock;

	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto out_unlock;

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	/*
	 * Don't limit the IOMMU merged segment size.
	 */
	dma_set_max_seg_size(dev->dev, 0xffffffff);

	mutex_unlock(&dev->shutdown_lock);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		result = -EBUSY;
		goto out;
	}

	/*
	 * We do not support an SGL for metadata (yet), so we are limited to a
	 * single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto out;

	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
	}

	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		nvme_free_tagset(dev);
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		nvme_dev_add(dev);
		nvme_unfreeze(&dev->ctrl);
	}

	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller live state\n");
		result = -ENODEV;
		goto out;
	}

	nvme_start_ctrl(&dev->ctrl);
	return;

 out_unlock:
	mutex_unlock(&dev->shutdown_lock);
 out:
	if (result)
		dev_warn(dev->ctrl.device,
			 "Removing after probe failure status: %d\n", result);
	nvme_remove_dead_ctrl(dev);
}

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		device_release_driver(&pdev->dev);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name			= "pcie",
	.module			= THIS_MODULE,
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.free_ctrl		= nvme_pci_free_ctrl,
	.submit_async_event	= nvme_pci_submit_async_event,
	.get_address		= nvme_pci_get_address,
};

static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_request_mem_regions(pdev, "nvme"))
		return -ENODEV;

	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
		goto release;

	return 0;
  release:
	pci_release_mem_regions(pdev);
	return -ENODEV;
}

static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and uses the deepest sleep state.
		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
		 * 950 PRO 256GB", but it seems to be restricted to two Dell
		 * laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within a few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
			return NVME_QUIRK_NO_APST;
	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
		/*
		 * Forcing to use host managed nvme power settings for
		 * lowest idle power with quick resume latency on
		 * Samsung and Toshiba SSDs based on suspend behavior
		 * on Coffee Lake board for LENOVO C640
		 */
		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
			return NVME_QUIRK_SIMPLE_SUSPEND;
	}

	return 0;
}

static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;

	flush_work(&dev->ctrl.reset_work);
	flush_work(&dev->ctrl.scan_work);
	nvme_put_ctrl(&dev->ctrl);
}

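/*
 * Probe: allocate the per-controller structure, map BAR0, create the DMA
 * pools and iod mempool, register the controller and schedule the initial
 * reset asynchronously so probe itself returns quickly.
 */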
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;
	unsigned long quirks = id->driver_data;
	size_t alloc_size;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;

	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto put_pci;

	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto unmap;

	quirks |= check_vendor_combination_bug(pdev);

	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
	alloc_size = nvme_pci_iod_alloc_size();
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

	nvme_reset_ctrl(&dev->ctrl);
	async_schedule(nvme_async_probe, dev);

	return 0;

 release_mempool:
	mempool_destroy(dev->iod_mempool);
 release_pools:
	nvme_release_prp_pools(dev);
 unmap:
	nvme_dev_unmap(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

static void nvme_reset_prepare(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * We don't need to check the return value from waiting for the reset
	 * state as pci_dev device lock is held, making it impossible to race
	 * with ->remove().
	 */
	nvme_disable_prepare_reset(dev, false);
	nvme_sync_queues(&dev->ctrl);
}

static void nvme_reset_done(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	if (!nvme_try_sched_reset(&dev->ctrl))
		flush_work(&dev->ctrl.reset_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_disable_prepare_reset(dev, true);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	pci_set_drvdata(pdev, NULL);

	if (!pci_device_is_present(pdev)) {
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
		nvme_dev_disable(dev, true);
		nvme_dev_remove_admin(dev);
	}

	flush_work(&dev->ctrl.reset_work);
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_release_cmb(dev);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_uninit_ctrl(&dev->ctrl);
}

#ifdef CONFIG_PM_SLEEP
static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
}

static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
{
	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
}

static int nvme_resume(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
	struct nvme_ctrl *ctrl = &ndev->ctrl;

	if (ndev->last_ps == U32_MAX ||
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
		return nvme_try_sched_reset(&ndev->ctrl);
	return 0;
}

static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);
	struct nvme_ctrl *ctrl = &ndev->ctrl;
	int ret = -EBUSY;

	ndev->last_ps = U32_MAX;

	/*
	 * The platform does not remove power for a kernel managed suspend so
	 * use host managed nvme power settings for lowest idle power if
	 * possible. This should have quicker resume latency than a full device
	 * shutdown.  But if the firmware is involved after the suspend or the
	 * device does not support any non-default power states, shut down the
	 * device fully.
	 *
	 * If ASPM is not enabled for the device, shut down the device and allow
	 * the PCI bus layer to put it into D3 in order to take the PCIe link
	 * down, so as to allow the platform to achieve its minimum low-power
	 * state (which may not be possible if the link is up).
	 *
	 * If a host memory buffer is enabled, shut down the device as the NVMe
	 * specification allows the device to access the host memory buffer in
	 * host DRAM from all power states, but hosts will fail access to DRAM
	 * during S3.
	 */
	if (pm_suspend_via_firmware() || !ctrl->npss ||
	    !pcie_aspm_enabled(pdev) ||
	    ndev->nr_host_mem_descs ||
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);

	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

	if (ctrl->state != NVME_CTRL_LIVE)
		goto unfreeze;

	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
	if (ret < 0)
		goto unfreeze;

	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);

	ret = nvme_set_power_state(ctrl, ctrl->npss);
	if (ret < 0)
		goto unfreeze;

	if (ret) {
		/* discard the saved state */
		pci_load_saved_state(pdev, NULL);

		/*
		 * Clearing npss forces a controller reset on resume. The
		 * correct value will be rediscovered then.
		 */
		ret = nvme_disable_prepare_reset(ndev, true);
		ctrl->npss = 0;
	}
unfreeze:
	nvme_unfreeze(ctrl);
	return ret;
}

static int nvme_simple_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));

	return nvme_disable_prepare_reset(ndev, true);
}

static int nvme_simple_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	return nvme_try_sched_reset(&ndev->ctrl);
}

static const struct dev_pm_ops nvme_dev_pm_ops = {
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.freeze		= nvme_simple_suspend,
	.thaw		= nvme_simple_resume,
	.poweroff	= nvme_simple_suspend,
	.restore	= nvme_simple_resume,
};
#endif /* CONFIG_PM_SLEEP */

static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	dev_info(dev->ctrl.device, "restart after slot reset\n");
	pci_restore_state(pdev);
	nvme_reset_ctrl(&dev->ctrl);
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
}

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
};
3077
	{ PCI_VDEVICE(INTEL, 0x0953),	/* Intel 750/P3500/P3600/P3700 */
3078
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3079
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3080
	{ PCI_VDEVICE(INTEL, 0x0a53),	/* Intel P3520 */
3081
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3082
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3083
	{ PCI_VDEVICE(INTEL, 0x0a54),	/* Intel P4500/P4600 */
3084
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3085
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3086
	{ PCI_VDEVICE(INTEL, 0x0a55),	/* Dell Express Flash P4600 */
3087 3088
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3089
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
3090
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
3091 3092
				NVME_QUIRK_MEDIUM_PRIO_SQ |
				NVME_QUIRK_NO_TEMP_THRESH_CHANGE },
3093 3094
	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3095
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
3096 3097
		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3098 3099
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3100 3101
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3102 3103
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3104 3105
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3106 3107 3108 3109
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
C
Christoph Hellwig 已提交
3110 3111 3112 3113
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
W
Wei Xu 已提交
3114 3115
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
3116 3117
	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3118 3119 3120
	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
M
Matthew Wilcox 已提交
3121
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
3122 3123
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
3124
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
3125 3126
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
3127 3128
				NVME_QUIRK_128_BYTES_SQES |
				NVME_QUIRK_SHARED_TAGS },
M
Matthew Wilcox 已提交
3129 3130 3131 3132 3133 3134 3135 3136
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
#ifdef CONFIG_PM_SLEEP
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
#endif
	.sriov_configure = pci_sriov_configure_simple,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);

	return pci_register_driver(&nvme_driver);
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	flush_workqueue(nvme_wq);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);