pci.c 85.7 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
M
Matthew Wilcox 已提交
2 3
/*
 * NVM Express device driver
4
 * Copyright (c) 2011-2014, Intel Corporation.
M
Matthew Wilcox 已提交
5 6
 */

7
#include <linux/acpi.h>
K
Keith Busch 已提交
8
#include <linux/aer.h>
9
#include <linux/async.h>
M
Matthew Wilcox 已提交
10
#include <linux/blkdev.h>
M
Matias Bjørling 已提交
11
#include <linux/blk-mq.h>
12
#include <linux/blk-mq-pci.h>
13
#include <linux/dmi.h>
M
Matthew Wilcox 已提交
14 15 16 17 18
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
19
#include <linux/mutex.h>
20
#include <linux/once.h>
M
Matthew Wilcox 已提交
21
#include <linux/pci.h>
22
#include <linux/suspend.h>
K
Keith Busch 已提交
23
#include <linux/t10-pi.h>
M
Matthew Wilcox 已提交
24
#include <linux/types.h>
25
#include <linux/io-64-nonatomic-lo-hi.h>
26
#include <linux/io-64-nonatomic-hi-lo.h>
27
#include <linux/sed-opal.h>
28
#include <linux/pci-p2pdma.h>
29

Y
yupeng 已提交
30
#include "trace.h"
31 32
#include "nvme.h"

33
#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
34
#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
35

C
Chaitanya Kulkarni 已提交
36
#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
37

38 39 40 41 42 43 44
/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127

45 46 47
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

48
static bool use_cmb_sqes = true;
49
module_param(use_cmb_sqes, bool, 0444);
50 51
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

52 53 54 55
static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
56

C
Chaitanya Kulkarni 已提交
57 58 59 60 61 62
static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

63 64 65
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
66
	.get = param_get_uint,
67 68
};

69
static unsigned int io_queue_depth = 1024;
70 71 72
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");

73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
/*
 * Setter shared by the queue-count module parameters.  The value must parse
 * as a base-10 unsigned int and must not exceed num_possible_cpus();
 * anything else is rejected with -EINVAL.
 */
static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
	unsigned int count;

	if (kstrtouint(val, 10, &count))
		return -EINVAL;
	if (count > num_possible_cpus())
		return -EINVAL;

	return param_set_uint(val, kp);
}

/*
 * Parameter ops for queue-count module parameters: writes go through
 * io_queue_count_set() for validation, reads use the stock uint getter.
 */
static const struct kernel_param_ops io_queue_count_ops = {
	.set = io_queue_count_set,
	.get = param_get_uint,
};

89
static unsigned int write_queues;
90
module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
91 92 93 94
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

95
static unsigned int poll_queues;
96
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
J
Jens Axboe 已提交
97 98
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

99 100 101 102
static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");

103 104
struct nvme_dev;
struct nvme_queue;
105

106
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
107
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
108

109 110 111 112
/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
113
	struct nvme_queue *queues;
114 115 116 117 118 119 120 121
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
122
	unsigned io_queues[HCTX_MAX_TYPES];
123
	unsigned int num_vecs;
124
	u32 q_depth;
125
	int io_sqes;
126 127
	u32 db_stride;
	void __iomem *bar;
128
	unsigned long bar_mapped_size;
129
	struct work_struct remove_work;
130
	struct mutex shutdown_lock;
131 132
	bool subsystem;
	u64 cmb_size;
133
	bool cmb_use_sqes;
134
	u32 cmbsz;
135
	u32 cmbloc;
136
	struct nvme_ctrl ctrl;
137
	u32 last_ps;
138

139 140
	mempool_t *iod_mempool;

141
	/* shadow doorbell buffer support: */
142 143 144 145
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;
146 147 148 149

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
150
	dma_addr_t host_mem_descs_dma;
151 152
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
153 154 155
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;
K
Keith Busch 已提交
156
};
157

158 159
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
160
	int ret;
161
	u32 n;
162

163
	ret = kstrtou32(val, 10, &n);
164 165 166
	if (ret != 0 || n < 2)
		return -EINVAL;

167
	return param_set_uint(val, kp);
168 169
}

170 171 172 173 174 175 176 177 178 179
/* Index of queue @qid's submission doorbell slot: entry 2 * qid, scaled by @stride. */
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	unsigned int slot = qid * 2;

	return slot * stride;
}

/* Index of queue @qid's completion doorbell slot: entry 2 * qid + 1, scaled by @stride. */
static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	unsigned int slot = qid * 2 + 1;

	return slot * stride;
}

180 181 182 183 184
/* Map a pointer to the embedded ->ctrl member back to its enclosing nvme_dev. */
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

M
Matthew Wilcox 已提交
185 186 187 188 189
/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
M
Matthew Wilcox 已提交
190
	struct nvme_dev *dev;
191
	spinlock_t sq_lock;
192
	void *sq_cmds;
193 194
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
K
Keith Busch 已提交
195
	struct nvme_completion *cqes;
M
Matthew Wilcox 已提交
196 197 198
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
199
	u32 q_depth;
200
	u16 cq_vector;
M
Matthew Wilcox 已提交
201
	u16 sq_tail;
202
	u16 last_sq_tail;
M
Matthew Wilcox 已提交
203
	u16 cq_head;
K
Keith Busch 已提交
204
	u16 qid;
205
	u8 cq_phase;
206
	u8 sqes;
207 208
	unsigned long flags;
#define NVMEQ_ENABLED		0
209
#define NVMEQ_SQ_CMB		1
210
#define NVMEQ_DELETE_ERROR	2
211
#define NVMEQ_POLLED		3
212 213 214 215
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
216
	struct completion delete_done;
M
Matthew Wilcox 已提交
217 218
};

219
/*
220 221 222 223
 * The nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
 * to the actual struct scatterlist.
224 225
 */
struct nvme_iod {
226
	struct nvme_request req;
227
	struct nvme_command cmd;
C
Christoph Hellwig 已提交
228
	struct nvme_queue *nvmeq;
C
Chaitanya Kulkarni 已提交
229
	bool use_sgl;
C
Christoph Hellwig 已提交
230
	int aborted;
231 232 233
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	dma_addr_t first_dma;
234
	unsigned int dma_len;	/* length of single DMA segment mapping */
235
	dma_addr_t meta_dma;
C
Christoph Hellwig 已提交
236
	struct scatterlist *sg;
M
Matthew Wilcox 已提交
237 238
};

239
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
240
{
241
	return dev->nr_allocated_queues * 8 * dev->db_stride;
242 243 244 245
}

/*
 * Allocate the coherent shadow doorbell and event index buffers.
 *
 * Returns 0 on success or if the buffers already exist, -ENOMEM if either
 * allocation fails (in which case nothing stays allocated).
 */
static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int bytes = nvme_dbbuf_size(dev);

	/* Already set up. */
	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, bytes,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;

	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, bytes,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (dev->dbbuf_eis)
		return 0;

	/* Second allocation failed: release the first one again. */
	dma_free_coherent(dev->dev, bytes,
			  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
	dev->dbbuf_dbs = NULL;
	return -ENOMEM;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
271
	unsigned int mem_size = nvme_dbbuf_size(dev);
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

/*
 * Point @nvmeq's shadow doorbell and event index pointers at its slots in
 * the device-wide buffers.  Does nothing for qid 0 or when no shadow
 * doorbell buffer is allocated.
 */
static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	unsigned int sq_slot, cq_slot;

	if (!qid || !dev->dbbuf_dbs)
		return;

	sq_slot = sq_idx(qid, dev->db_stride);
	cq_slot = cq_idx(qid, dev->db_stride);

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_slot];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_slot];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_slot];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_slot];
}

297 298 299 300 301 302 303 304 305 306 307
/* Drop @nvmeq's shadow doorbell pointers; queue 0 is left untouched. */
static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
{
	if (nvmeq->qid) {
		nvmeq->dbbuf_sq_db = NULL;
		nvmeq->dbbuf_cq_db = NULL;
		nvmeq->dbbuf_sq_ei = NULL;
		nvmeq->dbbuf_cq_ei = NULL;
	}
}

308 309 310
static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;
311
	unsigned int i;
312 313 314 315 316 317 318 319 320 321

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
322
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
323 324
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
325 326 327

		for (i = 1; i <= dev->online_queues; i++)
			nvme_dbbuf_free(&dev->queues[i]);
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351
	}
}

/*
 * Wrap-safe 16-bit check: non-zero when @event_idx lies in the half-open
 * window [@old, @new_idx) that the doorbell just moved across.
 */
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	u16 past_event = new_idx - event_idx - 1;
	u16 advanced = new_idx - old;

	return past_event < advanced;
}

/*
 * Update dbbuf and return true if an MMIO is required.
 *
 * Without a shadow doorbell (@dbbuf_db is NULL) an MMIO write is always
 * required.
 */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	u16 previous;

	if (!dbbuf_db)
		return true;

	/*
	 * Ensure that the queue is written before updating
	 * the doorbell in memory
	 */
	wmb();

	previous = *dbbuf_db;
	*dbbuf_db = value;

	/*
	 * Ensure that the doorbell is updated before reading the event
	 * index from memory.  The controller needs to provide similar
	 * ordering to ensure the event index is updated before reading
	 * the doorbell.
	 */
	mb();

	return nvme_dbbuf_need_event(*dbbuf_ei, value, previous);
}

367 368 369 370 371
/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
372
static int nvme_pci_npages_prp(void)
373
{
374
	unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
375
				      NVME_CTRL_PAGE_SIZE);
376 377 378
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

C
Chaitanya Kulkarni 已提交
379 380 381 382
/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
383
static int nvme_pci_npages_sgl(void)
384
{
385 386
	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
			PAGE_SIZE);
C
Christoph Hellwig 已提交
387
}
388

389
static size_t nvme_pci_iod_alloc_size(void)
C
Christoph Hellwig 已提交
390
{
391
	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
C
Chaitanya Kulkarni 已提交
392

393 394
	return sizeof(__le64 *) * npages +
		sizeof(struct scatterlist) * NVME_MAX_SEGS;
C
Christoph Hellwig 已提交
395
}
396

M
Matias Bjørling 已提交
397 398
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
399
{
M
Matias Bjørling 已提交
400
	struct nvme_dev *dev = data;
401
	struct nvme_queue *nvmeq = &dev->queues[0];
M
Matias Bjørling 已提交
402

403 404 405
	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);

M
Matias Bjørling 已提交
406 407
	hctx->driver_data = nvmeq;
	return 0;
408 409
}

M
Matias Bjørling 已提交
410 411
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
M
Matthew Wilcox 已提交
412
{
M
Matias Bjørling 已提交
413
	struct nvme_dev *dev = data;
414
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
M
Matias Bjørling 已提交
415

416
	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
M
Matias Bjørling 已提交
417 418
	hctx->driver_data = nvmeq;
	return 0;
M
Matthew Wilcox 已提交
419 420
}

421 422
static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
M
Matthew Wilcox 已提交
423
{
424
	struct nvme_dev *dev = set->driver_data;
C
Christoph Hellwig 已提交
425
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
426
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
427
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];
M
Matias Bjørling 已提交
428 429

	BUG_ON(!nvmeq);
C
Christoph Hellwig 已提交
430
	iod->nvmeq = nvmeq;
431 432

	nvme_req(req)->ctrl = &dev->ctrl;
433
	nvme_req(req)->cmd = &iod->cmd;
M
Matias Bjørling 已提交
434 435 436
	return 0;
}

437 438 439 440 441 442 443 444 445
/*
 * First interrupt vector used by I/O queues: if we have more than 1 vec,
 * the admin queue offsets us by 1, otherwise everything shares vector 0.
 */
static int queue_irq_offset(struct nvme_dev *dev)
{
	return dev->num_vecs > 1 ? 1 : 0;
}

446 447 448
static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
449 450 451 452 453 454 455 456
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
457
			BUG_ON(i == HCTX_TYPE_DEFAULT);
458
			continue;
459 460
		}

J
Jens Axboe 已提交
461 462 463 464
		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
465
		map->queue_offset = qoff;
466
		if (i != HCTX_TYPE_POLL && offset)
J
Jens Axboe 已提交
467 468 469
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
470 471 472 473 474
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
475 476
}

477 478 479 480
/*
 * Write sq tail if we are asked to, or if the next command would wrap
 * onto the tail value that was last written.  Callers in this file hold
 * sq_lock.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	bool ring = write_sq;

	if (!ring) {
		u16 upcoming = nvmeq->sq_tail + 1;

		if (upcoming == nvmeq->q_depth)
			upcoming = 0;
		ring = (upcoming == nvmeq->last_sq_tail);
	}
	if (!ring)
		return;

	/* The MMIO write is skipped when the shadow doorbell says so. */
	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}

M
Matthew Wilcox 已提交
497
/**
498
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
M
Matthew Wilcox 已提交
499 500
 * @nvmeq: The queue to use
 * @cmd: The command to send
501
 * @write_sq: whether to write to the SQ doorbell
M
Matthew Wilcox 已提交
502
 */
503 504
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
M
Matthew Wilcox 已提交
505
{
506
	spin_lock(&nvmeq->sq_lock);
507 508
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
	       cmd, sizeof(*cmd));
509 510
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
511
	nvme_write_sq_db(nvmeq, write_sq);
512 513 514 515 516 517 518 519
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
520 521
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
522
	spin_unlock(&nvmeq->sq_lock);
M
Matthew Wilcox 已提交
523 524
}

C
Chaitanya Kulkarni 已提交
525
static void **nvme_pci_iod_list(struct request *req)
M
Matthew Wilcox 已提交
526
{
C
Christoph Hellwig 已提交
527
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Chaitanya Kulkarni 已提交
528
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
M
Matthew Wilcox 已提交
529 530
}

531 532 533
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
534
	int nseg = blk_rq_nr_phys_segments(req);
535 536
	unsigned int avg_seg_size;

537
	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
538 539 540 541 542 543 544 545 546 547

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

548
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
M
Matthew Wilcox 已提交
549
{
550
	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
551 552
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	dma_addr_t dma_addr = iod->first_dma;
553 554
	int i;

555 556 557 558 559 560
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = nvme_pci_iod_list(req)[i];
		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);

		dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
		dma_addr = next_dma_addr;
561 562
	}

563
}
564

565 566 567 568 569 570
static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
{
	const int last_sg = SGES_PER_PAGE - 1;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	dma_addr_t dma_addr = iod->first_dma;
	int i;
571

572 573 574
	for (i = 0; i < iod->npages; i++) {
		struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
		dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
575

576 577 578
		dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
		dma_addr = next_dma_addr;
	}
C
Chaitanya Kulkarni 已提交
579

580
}
C
Chaitanya Kulkarni 已提交
581

582 583 584
/*
 * Undo the scatterlist DMA mapping of @req, using the p2pdma variant when
 * the first scatterlist page is a PCI peer-to-peer page.
 */
static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	if (!is_pci_p2pdma_page(sg_page(iod->sg))) {
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
		return;
	}

	pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
}
C
Chaitanya Kulkarni 已提交
592

593 594 595
/*
 * Release everything nvme_map_data() set up for @req: either the single
 * page mapping (iod->dma_len != 0), or the scatterlist mapping, its PRP/SGL
 * list pages and the iod->sg allocation itself.
 */
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	/* Single-segment fast path: only one page mapping to undo. */
	if (iod->dma_len) {
		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
			       rq_dma_dir(req));
		return;
	}

	WARN_ON_ONCE(!iod->nents);

	nvme_unmap_sg(dev, req);

	if (iod->npages != 0) {
		if (iod->use_sgl)
			nvme_free_sgls(dev, req);
		else
			nvme_free_prps(dev, req);
	} else {
		/* npages == 0: the single list came from the small pool */
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			      iod->first_dma);
	}

	mempool_free(iod->sg, dev->iod_mempool);
}

616 617 618 619 620 621 622 623 624 625 626 627 628 629
/* Dump every entry of a scatterlist at warning level (debugging aid). */
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	struct scatterlist *entry;
	int idx;

	for_each_sg(sgl, entry, nents, idx) {
		dma_addr_t phys = sg_phys(entry);

		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			idx, &phys, entry->offset, entry->length,
			&sg_dma_address(entry), sg_dma_len(entry));
	}
}

C
Chaitanya Kulkarni 已提交
630 631
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
M
Matthew Wilcox 已提交
632
{
C
Christoph Hellwig 已提交
633
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
634
	struct dma_pool *pool;
635
	int length = blk_rq_payload_bytes(req);
636
	struct scatterlist *sg = iod->sg;
M
Matthew Wilcox 已提交
637 638
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
639
	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
640
	__le64 *prp_list;
C
Chaitanya Kulkarni 已提交
641
	void **list = nvme_pci_iod_list(req);
642
	dma_addr_t prp_dma;
643
	int nprps, i;
M
Matthew Wilcox 已提交
644

645
	length -= (NVME_CTRL_PAGE_SIZE - offset);
646 647
	if (length <= 0) {
		iod->first_dma = 0;
C
Chaitanya Kulkarni 已提交
648
		goto done;
649
	}
M
Matthew Wilcox 已提交
650

651
	dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
M
Matthew Wilcox 已提交
652
	if (dma_len) {
653
		dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
M
Matthew Wilcox 已提交
654 655 656 657 658 659
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

660
	if (length <= NVME_CTRL_PAGE_SIZE) {
661
		iod->first_dma = dma_addr;
C
Chaitanya Kulkarni 已提交
662
		goto done;
663 664
	}

665
	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
666 667
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
668
		iod->npages = 0;
669 670
	} else {
		pool = dev->prp_page_pool;
671
		iod->npages = 1;
672 673
	}

674
	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
675
	if (!prp_list) {
676
		iod->first_dma = dma_addr;
677
		iod->npages = -1;
678
		return BLK_STS_RESOURCE;
679
	}
680 681
	list[0] = prp_list;
	iod->first_dma = prp_dma;
682 683
	i = 0;
	for (;;) {
684
		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
685
			__le64 *old_prp_list = prp_list;
686
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
687
			if (!prp_list)
688
				goto free_prps;
689
			list[iod->npages++] = prp_list;
690 691 692
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
693 694
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
695 696 697
		dma_len -= NVME_CTRL_PAGE_SIZE;
		dma_addr += NVME_CTRL_PAGE_SIZE;
		length -= NVME_CTRL_PAGE_SIZE;
698 699 700 701
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
702 703
		if (unlikely(dma_len < 0))
			goto bad_sgl;
704 705 706
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
M
Matthew Wilcox 已提交
707
	}
C
Chaitanya Kulkarni 已提交
708 709 710
done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
711
	return BLK_STS_OK;
712 713 714 715
free_prps:
	nvme_free_prps(dev, req);
	return BLK_STS_RESOURCE;
bad_sgl:
716 717 718
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
719
	return BLK_STS_IOERR;
M
Matthew Wilcox 已提交
720 721
}

C
Chaitanya Kulkarni 已提交
722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743
/*
 * Fill @sge as a data block descriptor covering the DMA address and length
 * of scatterlist entry @sg.
 */
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	/* the descriptor type is stored in the upper nibble of the type byte */
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

/*
 * Fill @sge as a segment descriptor pointing at the descriptor list at
 * @dma_addr.  With fewer than SGES_PER_PAGE entries outstanding it is a
 * "last segment" descriptor sized for exactly those entries; otherwise it
 * describes a full page of descriptors.
 */
static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);

	if (!(entries < SGES_PER_PAGE)) {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
		return;
	}

	sge->length = cpu_to_le32(entries * sizeof(*sge));
	sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
744
		struct request *req, struct nvme_rw_command *cmd, int entries)
C
Chaitanya Kulkarni 已提交
745 746 747 748 749 750
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
751
	int i = 0;
C
Chaitanya Kulkarni 已提交
752 753 754 755

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

756
	if (entries == 1) {
C
Chaitanya Kulkarni 已提交
757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
787
				goto free_sgls;
C
Chaitanya Kulkarni 已提交
788 789 790 791 792 793 794 795 796

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
797
	} while (--entries > 0);
C
Chaitanya Kulkarni 已提交
798 799

	return BLK_STS_OK;
800 801 802
free_sgls:
	nvme_free_sgls(dev, req);
	return BLK_STS_RESOURCE;
C
Chaitanya Kulkarni 已提交
803 804
}

805 806 807 808 809
static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
810 811
	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
812 813 814 815 816 817 818 819 820

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
	if (bv->bv_len > first_prp_len)
		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
821
	return BLK_STS_OK;
822 823
}

824 825 826 827 828 829 830 831 832 833 834
static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

835
	cmnd->flags = NVME_CMD_SGL_METABUF;
836 837 838
	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
839
	return BLK_STS_OK;
840 841
}

842
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
843
		struct nvme_command *cmnd)
844
{
C
Christoph Hellwig 已提交
845
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
846
	blk_status_t ret = BLK_STS_RESOURCE;
847
	int nr_mapped;
848

849 850 851 852
	if (blk_rq_nr_phys_segments(req) == 1) {
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
853
			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
854 855
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);
856

857
			if (iod->nvmeq->qid && sgl_threshold &&
858 859 860
			    dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
				return nvme_setup_sgl_simple(dev, req,
							     &cmnd->rw, &bv);
861 862 863 864
		}
	}

	iod->dma_len = 0;
865 866 867
	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
	if (!iod->sg)
		return BLK_STS_RESOURCE;
868
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
869
	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
C
Christoph Hellwig 已提交
870
	if (!iod->nents)
871
		goto out_free_sg;
872

873
	if (is_pci_p2pdma_page(sg_page(iod->sg)))
874 875
		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
876 877
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
878
					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
879
	if (!nr_mapped)
880
		goto out_free_sg;
881

882
	iod->use_sgl = nvme_pci_use_sgls(dev, req);
883
	if (iod->use_sgl)
884
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
C
Chaitanya Kulkarni 已提交
885 886
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
887
	if (ret != BLK_STS_OK)
888 889 890 891 892 893 894
		goto out_unmap_sg;
	return BLK_STS_OK;

out_unmap_sg:
	nvme_unmap_sg(dev, req);
out_free_sg:
	mempool_free(iod->sg, dev->iod_mempool);
895 896
	return ret;
}
897

898 899 900 901
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
M
Matthew Wilcox 已提交
902

903 904 905 906 907
	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
			rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->meta_dma))
		return BLK_STS_IOERR;
	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
908
	return BLK_STS_OK;
M
Matthew Wilcox 已提交
909 910
}

911 912 913
/*
 * NOTE: ns is NULL when called on the admin queue.
 */
914
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
M
Matias Bjørling 已提交
915
			 const struct blk_mq_queue_data *bd)
916
{
M
Matias Bjørling 已提交
917 918
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
919
	struct nvme_dev *dev = nvmeq->dev;
M
Matias Bjørling 已提交
920
	struct request *req = bd->rq;
921
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
922
	struct nvme_command *cmnd = &iod->cmd;
923
	blk_status_t ret;
K
Keith Busch 已提交
924

925 926 927 928
	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;

929 930 931 932
	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
933
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
934 935
		return BLK_STS_IOERR;

936 937 938
	if (!nvme_check_ready(&dev->ctrl, req, true))
		return nvme_fail_nonready_command(&dev->ctrl, req);

939
	ret = nvme_setup_cmd(ns, req);
940
	if (ret)
C
Christoph Hellwig 已提交
941
		return ret;
M
Matias Bjørling 已提交
942

943
	if (blk_rq_nr_phys_segments(req)) {
944
		ret = nvme_map_data(dev, req, cmnd);
945
		if (ret)
946
			goto out_free_cmd;
947
	}
M
Matias Bjørling 已提交
948

949
	if (blk_integrity_rq(req)) {
950
		ret = nvme_map_metadata(dev, req, cmnd);
951 952 953 954
		if (ret)
			goto out_unmap_data;
	}

955
	blk_mq_start_request(req);
956
	nvme_submit_cmd(nvmeq, cmnd, bd->last);
957
	return BLK_STS_OK;
958 959
out_unmap_data:
	nvme_unmap_data(dev, req);
960 961
out_free_cmd:
	nvme_cleanup_cmd(req);
C
Christoph Hellwig 已提交
962
	return ret;
M
Matthew Wilcox 已提交
963
}
K
Keith Busch 已提交
964

965
static void nvme_pci_complete_rq(struct request *req)
966
{
C
Christoph Hellwig 已提交
967
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
968
	struct nvme_dev *dev = iod->nvmeq->dev;
M
Matias Bjørling 已提交
969

970 971 972
	if (blk_integrity_rq(req))
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
973
	if (blk_rq_nr_phys_segments(req))
974
		nvme_unmap_data(dev, req);
975
	nvme_complete_rq(req);
M
Matthew Wilcox 已提交
976 977
}

978
/* We read the CQE phase first to check if the rest of the entry is valid */
979
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
980
{
K
Keith Busch 已提交
981 982 983
	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];

	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
984 985
}

986
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
987
{
988
	u16 head = nvmeq->cq_head;
989

990 991 992
	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
993
}
994

C
Christoph Hellwig 已提交
995 996 997 998 999 1000 1001
static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return nvmeq->dev->admin_tagset.tags[0];
	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}

1002
/*
 * Process the completion queue entry at index @idx of @nvmeq: route AEN
 * completions to the core, otherwise look up the request by command id and
 * complete it.
 */
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	struct nvme_completion *cqe = &nvmeq->cqes[idx];
	__u16 cid = READ_ONCE(cqe->command_id);
	struct nvme_dev *dev = nvmeq->dev;
	struct request *rq;

	/*
	 * Asynchronous event requests never time out, survive queue freezes
	 * and frequently ignore aborts.  No struct request is allocated for
	 * them, so they are special-cased here instead.
	 */
	if (unlikely(nvme_is_aen_req(nvmeq->qid, cid))) {
		nvme_complete_async_event(&dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	rq = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cid);
	if (unlikely(!rq)) {
		/* No request is known under this tag. */
		dev_warn(dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cid, le16_to_cpu(cqe->sq_id));
		return;
	}

	trace_nvme_sq(rq, cqe->sq_head, nvmeq->sq_tail);

	/* Finish the request here unless the core already took care of it. */
	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
		nvme_pci_complete_rq(rq);
}
M
Matthew Wilcox 已提交
1032

1033 1034
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
1035
	u32 tmp = nvmeq->cq_head + 1;
1036 1037

	if (tmp == nvmeq->q_depth) {
1038
		nvmeq->cq_head = 0;
1039
		nvmeq->cq_phase ^= 1;
1040 1041
	} else {
		nvmeq->cq_head = tmp;
M
Matthew Wilcox 已提交
1042
	}
J
Jens Axboe 已提交
1043 1044
}

1045
static inline int nvme_process_cq(struct nvme_queue *nvmeq)
J
Jens Axboe 已提交
1046
{
1047
	int found = 0;
M
Matthew Wilcox 已提交
1048

1049
	while (nvme_cqe_pending(nvmeq)) {
1050
		found++;
1051 1052 1053 1054 1055
		/*
		 * load-load control dependency between phase and the rest of
		 * the cqe requires a full read memory barrier
		 */
		dma_rmb();
1056
		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
1057
		nvme_update_cq_head(nvmeq);
1058
	}
1059

1060
	if (found)
1061
		nvme_ring_cq_doorbell(nvmeq);
1062
	return found;
M
Matthew Wilcox 已提交
1063 1064 1065
}

static irqreturn_t nvme_irq(int irq, void *data)
1066 1067
{
	struct nvme_queue *nvmeq = data;
1068

1069
	if (nvme_process_cq(nvmeq))
1070 1071
		return IRQ_HANDLED;
	return IRQ_NONE;
1072 1073 1074 1075 1076
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
1077

1078
	if (nvme_cqe_pending(nvmeq))
1079 1080
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
1081 1082
}

1083
/*
1084
 * Poll for completions for any interrupt driven queue
1085 1086
 * Can be called from any context.
 */
1087
static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
J
Jens Axboe 已提交
1088
{
1089
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
J
Jens Axboe 已提交
1090

1091
	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
1092

1093 1094 1095
	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	nvme_process_cq(nvmeq);
	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
J
Jens Axboe 已提交
1096 1097
}

1098
static int nvme_poll(struct blk_mq_hw_ctx *hctx)
1099 1100 1101 1102 1103 1104 1105
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

1106
	spin_lock(&nvmeq->cq_poll_lock);
1107
	found = nvme_process_cq(nvmeq);
1108
	spin_unlock(&nvmeq->cq_poll_lock);
1109 1110 1111 1112

	return found;
}

1113
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
M
Matthew Wilcox 已提交
1114
{
1115
	struct nvme_dev *dev = to_nvme_dev(ctrl);
1116
	struct nvme_queue *nvmeq = &dev->queues[0];
M
Matias Bjørling 已提交
1117
	struct nvme_command c;
M
Matthew Wilcox 已提交
1118

M
Matias Bjørling 已提交
1119 1120
	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
1121
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1122
	nvme_submit_cmd(nvmeq, &c, true);
1123 1124
}

M
Matthew Wilcox 已提交
1125
/*
 * Synchronously issue a delete-queue admin command.
 * @opcode selects the command (the callers in this file pass the delete SQ
 * or delete CQ opcode) and @id is the queue identifier to delete.
 *
 * Returns the result of nvme_submit_sync_cmd().
 */
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.qid = cpu_to_le16(id);
	cmd.delete_queue.opcode = opcode;

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}

/*
 * Synchronously issue a Create I/O Completion Queue admin command for
 * queue @qid, backed by @nvmeq's completion ring and bound to interrupt
 * @vector.  Interrupts are enabled on the CQ unless the queue is polled.
 *
 * Returns the result of nvme_submit_sync_cmd().
 */
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command cmd;
	int cq_flags = NVME_QUEUE_PHYS_CONTIG;

	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		cq_flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&cmd, 0, sizeof(cmd));
	cmd.create_cq.opcode = nvme_admin_create_cq;
	cmd.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	cmd.create_cq.cqid = cpu_to_le16(qid);
	/* The queue size field is encoded as depth minus one. */
	cmd.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	cmd.create_cq.cq_flags = cpu_to_le16(cq_flags);
	cmd.create_cq.irq_vector = cpu_to_le16(vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}

/*
 * Synchronously issue a Create I/O Submission Queue admin command for
 * queue @qid, backed by @nvmeq's submission ring and paired with the
 * completion queue of the same id.
 *
 * Returns the result of nvme_submit_sync_cmd().
 */
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command cmd;
	int sq_flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		sq_flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&cmd, 0, sizeof(cmd));
	cmd.create_sq.opcode = nvme_admin_create_sq;
	cmd.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	cmd.create_sq.sqid = cpu_to_le16(qid);
	/* The queue size field is encoded as depth minus one. */
	cmd.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	cmd.create_sq.sq_flags = cpu_to_le16(sq_flags);
	cmd.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
}

/* Synchronously delete the completion queue identified by @cqid. */
static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	const u8 opcode = nvme_admin_delete_cq;

	return adapter_delete_queue(dev, opcode, cqid);
}

/* Synchronously delete the submission queue identified by @sqid. */
static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	const u8 opcode = nvme_admin_delete_sq;

	return adapter_delete_queue(dev, opcode, sqid);
}

1200
/*
 * End-of-I/O callback for the Abort admin command issued from
 * nvme_timeout().  Logs the abort command's status, returns the abort
 * credit to the controller's abort_limit and frees the abort request.
 *
 * @req:   the completed abort request
 * @error: block layer status of the abort request (not used here)
 */
static void abort_endio(struct request *req, blk_status_t error)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;

	/* Terminate the message with a newline like every other log line. */
	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x\n", nvme_req(req)->status);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
}

K
Keith Busch 已提交
1211 1212 1213 1214 1215 1216 1217
/*
 * Decide, from the controller status register value @csts, whether the
 * timeout handler should reset the controller right away.
 */
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
	/*
	 * If set, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	default:
		break;
	}

	/*
	 * We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	return (csts & NVME_CSTS_CFS) || nssro;
}

/*
 * Log that the controller is down and about to be reset, including @csts
 * and the PCI_STATUS config register (or the error from reading it).
 */
static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	u16 pci_status;
	int result;

	/* Read a config register to help see what died. */
	result = pci_read_config_word(pdev, PCI_STATUS, &pci_status);
	if (result != PCIBIOS_SUCCESSFUL) {
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
		return;
	}

	dev_warn(dev->ctrl.device,
		 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
		 csts, pci_status);
}

1254
/*
 * blk-mq ->timeout handler for both the admin and the I/O queues.
 *
 * Escalation order, as implemented below:
 *   1. PCI channel offline         -> just re-arm the timer.
 *   2. Controller failed           -> disable and reset it.
 *   3. Completion was only missed  -> poll it in and finish.
 *   4. Controller connecting/deleting -> disable (shutdown) the controller;
 *      controller resetting           -> re-arm the timer.
 *   5. Admin queue command, or a command already aborted once
 *                                  -> disable and reset the controller.
 *   6. Otherwise send an Abort admin command and re-arm the timer.
 */
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command abort_cmd;
	struct request *abort_rq;
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

	/*
	 * If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(ctrl);
		return BLK_EH_DONE;
	}

	/*
	 * Did we miss an interrupt?
	 */
	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		nvme_poll_irqdisable(nvmeq);
	else
		nvme_poll(req->mq_hctx);

	if (blk_mq_request_completed(req)) {
		dev_warn(ctrl->device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
		return BLK_EH_DONE;
	}

	/*
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (ctrl->state) {
	case NVME_CTRL_CONNECTING:
		nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING);
		fallthrough;
	case NVME_CTRL_DELETING:
		dev_warn_ratelimited(ctrl->device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		nvme_dev_disable(dev, true);
		return BLK_EH_DONE;
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
	default:
		break;
	}

	/*
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(ctrl->device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(ctrl);

		return BLK_EH_DONE;
	}

	/* Take an abort credit; give it back if none was available. */
	if (atomic_dec_return(&ctrl->abort_limit) < 0) {
		atomic_inc(&ctrl->abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	iod->aborted = 1;

	memset(&abort_cmd, 0, sizeof(abort_cmd));
	abort_cmd.abort.opcode = nvme_admin_abort_cmd;
	abort_cmd.abort.cid = req->tag;
	abort_cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(ctrl->device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);

	abort_rq = nvme_alloc_request(ctrl->admin_q, &abort_cmd,
			BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_rq)) {
		/* Could not allocate the abort request: return the credit. */
		atomic_inc(&ctrl->abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_rq->end_io_data = NULL;
	blk_execute_rq_nowait(NULL, abort_rq, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

M
Matias Bjørling 已提交
1367 1368
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
1369
	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
1370
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1371 1372
	if (!nvmeq->sq_cmds)
		return;
1373

1374
	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
1375
		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
1376
				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1377
	} else {
1378
		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
1379
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1380
	}
1381 1382
}

1383
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1384 1385 1386
{
	int i;

1387 1388
	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
1389
		nvme_free_queue(&dev->queues[i]);
1390
	}
1391 1392
}

K
Keith Busch 已提交
1393 1394
/**
 * nvme_suspend_queue - put queue into suspended state
1395
 * @nvmeq: queue to suspend
K
Keith Busch 已提交
1396 1397
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
1398
{
1399
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
K
Keith Busch 已提交
1400
		return 1;
1401

1402
	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
1403
	mb();
1404

1405
	nvmeq->dev->online_queues--;
1406
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1407
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
1408 1409
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
K
Keith Busch 已提交
1410 1411
	return 0;
}
M
Matthew Wilcox 已提交
1412

1413 1414 1415 1416 1417 1418 1419 1420
/* Suspend all I/O queues (every queue except queue 0), highest index first. */
static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int qid = dev->ctrl.queue_count - 1;

	while (qid > 0) {
		nvme_suspend_queue(&dev->queues[qid]);
		qid--;
	}
}

1421
/*
 * Shut down (@shutdown true) or disable (@shutdown false) the controller,
 * then poll the admin queue once for any remaining completions.
 */
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *adminq = &dev->queues[0];

	if (!shutdown)
		nvme_disable_ctrl(&dev->ctrl);
	else
		nvme_shutdown_ctrl(&dev->ctrl);

	nvme_poll_irqdisable(adminq);
}

1433 1434
/*
 * Called only on a device that has been disabled and after all other threads
1435 1436 1437
 * that can check this device's completion queues have synced, except
 * nvme_poll(). This is the last chance for the driver to see a natural
 * completion before nvme_cancel_request() terminates all incomplete requests.
1438 1439 1440 1441 1442
 */
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
	int i;

1443 1444
	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
		spin_lock(&dev->queues[i].cq_poll_lock);
1445
		nvme_process_cq(&dev->queues[i]);
1446 1447
		spin_unlock(&dev->queues[i].cq_poll_lock);
	}
1448 1449
}

1450 1451 1452 1453
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
1454
	unsigned q_size_aligned = roundup(q_depth * entry_size,
1455
					  NVME_CTRL_PAGE_SIZE);
1456 1457

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
1458
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
1459

1460
		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
1461
		q_depth = div_u64(mem_per_q, entry_size);
1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1476
				int qid)
1477
{
1478 1479 1480
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1481
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
1482 1483 1484 1485 1486 1487 1488 1489
		if (nvmeq->sq_cmds) {
			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
							nvmeq->sq_cmds);
			if (nvmeq->sq_dma_addr) {
				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
				return 0;
			}

1490
			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1491
		}
1492
	}
1493

1494
	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
1495
				&nvmeq->sq_dma_addr, GFP_KERNEL);
1496 1497
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
1498 1499 1500
	return 0;
}

1501
static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
M
Matthew Wilcox 已提交
1502
{
1503
	struct nvme_queue *nvmeq = &dev->queues[qid];
M
Matthew Wilcox 已提交
1504

1505 1506
	if (dev->ctrl.queue_count > qid)
		return 0;
M
Matthew Wilcox 已提交
1507

1508
	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
1509 1510
	nvmeq->q_depth = depth;
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
1511
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
M
Matthew Wilcox 已提交
1512 1513 1514
	if (!nvmeq->cqes)
		goto free_nvmeq;

1515
	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
M
Matthew Wilcox 已提交
1516 1517
		goto free_cqdma;

M
Matthew Wilcox 已提交
1518
	nvmeq->dev = dev;
1519
	spin_lock_init(&nvmeq->sq_lock);
1520
	spin_lock_init(&nvmeq->cq_poll_lock);
M
Matthew Wilcox 已提交
1521
	nvmeq->cq_head = 0;
M
Matthew Wilcox 已提交
1522
	nvmeq->cq_phase = 1;
1523
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
K
Keith Busch 已提交
1524
	nvmeq->qid = qid;
1525
	dev->ctrl.queue_count++;
1526

1527
	return 0;
M
Matthew Wilcox 已提交
1528 1529

 free_cqdma:
1530 1531
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
M
Matthew Wilcox 已提交
1532
 free_nvmeq:
1533
	return -ENOMEM;
M
Matthew Wilcox 已提交
1534 1535
}

1536
static int queue_request_irq(struct nvme_queue *nvmeq)
1537
{
1538 1539 1540 1541 1542 1543 1544 1545 1546 1547
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
1548 1549
}

1550
/*
 * Reset the software state of @nvmeq (ring positions, phase, doorbell
 * pointer), clear its completion ring and count it as online.
 */
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	/* Submission side. */
	nvmeq->sq_tail = 0;
	nvmeq->last_sq_tail = 0;

	/* Completion side: start at slot 0 expecting phase 1. */
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;

	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;

	/* ensure the first interrupt sees the initialization */
	wmb();
}

J
Jens Axboe 已提交
1565
/*
 * Create I/O queue @qid on the controller: completion queue first, then the
 * submission queue, then (for non-polled queues) the interrupt.
 *
 * Returns 0 on success.  A negative return from creating the SQ is passed
 * straight back without deleting the CQ; a positive one deletes the CQ
 * again.  An interrupt request failure deletes both queues.
 */
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	u16 vector = 0;
	int result;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (polled)
		set_bit(NVMEQ_POLLED, &nvmeq->flags);
	else if (dev->num_vecs != 1)
		vector = qid;

	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
	if (result)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		return result;
	if (result > 0)
		goto release_cq;

	nvmeq->cq_vector = vector;
	nvme_init_queue(nvmeq, qid);

	if (!polled) {
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;

release_sq:
	/* nvme_init_queue() counted this queue as online; undo that. */
	dev->online_queues--;
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

1612
static const struct blk_mq_ops nvme_mq_admin_ops = {
1613
	.queue_rq	= nvme_queue_rq,
1614
	.complete	= nvme_pci_complete_rq,
M
Matias Bjørling 已提交
1615
	.init_hctx	= nvme_admin_init_hctx,
1616
	.init_request	= nvme_init_request,
M
Matias Bjørling 已提交
1617 1618 1619
	.timeout	= nvme_timeout,
};

1620
static const struct blk_mq_ops nvme_mq_ops = {
1621 1622 1623 1624 1625 1626 1627 1628
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
1629 1630
};

1631 1632
static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
1633
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
1634 1635 1636 1637 1638
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
1639
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
1640
		blk_cleanup_queue(dev->ctrl.admin_q);
1641 1642 1643 1644
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

M
Matias Bjørling 已提交
1645 1646
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
1647
	if (!dev->ctrl.admin_q) {
M
Matias Bjørling 已提交
1648 1649
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
K
Keith Busch 已提交
1650

K
Keith Busch 已提交
1651
		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1652
		dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
1653
		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
1654
		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
1655
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
M
Matias Bjørling 已提交
1656 1657 1658 1659
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
1660
		dev->ctrl.admin_tagset = &dev->admin_tagset;
M
Matias Bjørling 已提交
1661

1662 1663
		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
M
Matias Bjørling 已提交
1664 1665 1666
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
1667
		if (!blk_get_queue(dev->ctrl.admin_q)) {
1668
			nvme_dev_remove_admin(dev);
1669
			dev->ctrl.admin_q = NULL;
1670 1671
			return -ENODEV;
		}
K
Keith Busch 已提交
1672
	} else
1673
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
M
Matias Bjørling 已提交
1674 1675 1676 1677

	return 0;
}

1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
/*
 * Size of the BAR region that must be mapped to reach the doorbells of the
 * admin queue plus @nr_io_queues I/O queues.
 */
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	/* One extra queue for the admin queue. */
	unsigned int nr_queues = nr_io_queues + 1;
	unsigned int db_bytes = nr_queues * 8 * dev->db_stride;

	return NVME_REG_DBS + db_bytes;
}

/*
 * Make sure at least @size bytes of BAR 0 are mapped, remapping (unmap then
 * ioremap) when the current mapping is too small.
 *
 * Returns 0 on success, or -ENOMEM if @size exceeds the BAR or the mapping
 * fails; in the latter case bar_mapped_size is reset to 0.
 */
static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	/* Current mapping already covers the request. */
	if (size <= dev->bar_mapped_size)
		return 0;

	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;

	if (dev->bar)
		iounmap(dev->bar);

	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (dev->bar) {
		dev->bar_mapped_size = size;
		dev->dbs = dev->bar + NVME_REG_DBS;
		return 0;
	}

	dev->bar_mapped_size = 0;
	return -ENOMEM;
}

1704
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
1705
{
1706
	int result;
M
Matthew Wilcox 已提交
1707 1708 1709
	u32 aqa;
	struct nvme_queue *nvmeq;

1710 1711 1712 1713
	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

1714
	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
1715
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
1716

1717 1718 1719
	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
1720

1721
	result = nvme_disable_ctrl(&dev->ctrl);
1722 1723
	if (result < 0)
		return result;
M
Matthew Wilcox 已提交
1724

1725
	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
1726 1727
	if (result)
		return result;
M
Matthew Wilcox 已提交
1728

1729 1730
	dev->ctrl.numa_node = dev_to_node(dev->dev);

1731
	nvmeq = &dev->queues[0];
M
Matthew Wilcox 已提交
1732 1733 1734
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

1735 1736 1737
	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
M
Matthew Wilcox 已提交
1738

1739
	result = nvme_enable_ctrl(&dev->ctrl);
1740
	if (result)
K
Keith Busch 已提交
1741
		return result;
M
Matias Bjørling 已提交
1742

K
Keith Busch 已提交
1743
	nvmeq->cq_vector = 0;
1744
	nvme_init_queue(nvmeq, 0);
1745
	result = queue_request_irq(nvmeq);
1746
	if (result) {
1747
		dev->online_queues--;
K
Keith Busch 已提交
1748
		return result;
1749
	}
1750

1751
	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
M
Matthew Wilcox 已提交
1752 1753 1754
	return result;
}

1755
static int nvme_create_io_queues(struct nvme_dev *dev)
K
Keith Busch 已提交
1756
{
J
Jens Axboe 已提交
1757
	unsigned i, max, rw_queues;
1758
	int ret = 0;
K
Keith Busch 已提交
1759

1760
	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
1761
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
1762
			ret = -ENOMEM;
K
Keith Busch 已提交
1763
			break;
1764 1765
		}
	}
K
Keith Busch 已提交
1766

1767
	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
1768 1769 1770
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
J
Jens Axboe 已提交
1771 1772 1773 1774
	} else {
		rw_queues = max;
	}

1775
	for (i = dev->online_queues; i <= max; i++) {
J
Jens Axboe 已提交
1776 1777 1778
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
K
Keith Busch 已提交
1779
		if (ret)
K
Keith Busch 已提交
1780
			break;
M
Matthew Wilcox 已提交
1781
	}
1782 1783 1784

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
1785 1786
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
1787 1788 1789
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
M
Matthew Wilcox 已提交
1790 1791
}

1792 1793 1794 1795 1796 1797
static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

1798
	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
1799 1800 1801 1802
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

1803
static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
1804
{
1805 1806 1807 1808 1809 1810 1811 1812 1813 1814
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

/* Extract the size field of CMBSZ (a count of size units). */
static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	u32 sz = dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT;

	return sz & NVME_CMBSZ_SZ_MASK;
}

1815
static void nvme_map_cmb(struct nvme_dev *dev)
1816
{
1817
	u64 size, offset;
1818 1819
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
1820
	int bar;
1821

1822 1823 1824
	if (dev->cmb_size)
		return;

1825 1826 1827
	if (NVME_CAP_CMBS(dev->ctrl.cap))
		writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC);

1828
	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
1829 1830
	if (!dev->cmbsz)
		return;
1831
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
1832

1833 1834
	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
1835 1836
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);
1837 1838

	if (offset > bar_size)
1839
		return;
1840

1841 1842 1843 1844 1845 1846 1847 1848 1849 1850
	/*
	 * Tell the controller about the host side address mapping the CMB,
	 * and enable CMB decoding for the NVMe 1.4+ scheme:
	 */
	if (NVME_CAP_CMBS(dev->ctrl.cap)) {
		hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
			     (pci_bus_address(pdev, bar) + offset),
			     dev->bar + NVME_REG_CMBMSC);
	}

1851 1852 1853 1854 1855 1856 1857 1858
	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

1859 1860 1861
	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
1862
		return;
1863 1864
	}

1865
	dev->cmb_size = size;
1866 1867 1868 1869 1870
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);
1871 1872 1873 1874 1875

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
1876 1877 1878 1879
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
1880
	if (dev->cmb_size) {
1881 1882
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
1883
		dev->cmb_size = 0;
1884 1885 1886
	}
}

1887 1888
/*
 * Issue a Set Features (Host Memory Buffer) admin command describing the
 * currently allocated host memory descriptors.  @bits is placed in dword 11.
 *
 * Returns the result of nvme_submit_sync_cmd(); failures are also logged.
 */
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	/* The buffer size is passed in controller pages, not bytes. */
	u32 nr_pages = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
	u64 descs_dma = dev->host_mem_descs_dma;
	struct nvme_command cmd;
	int ret;

	memset(&cmd, 0, sizeof(cmd));
	cmd.features.opcode	= nvme_admin_set_features;
	cmd.features.fid	= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	cmd.features.dword11	= cpu_to_le32(bits);
	cmd.features.dword12	= cpu_to_le32(nr_pages);
	cmd.features.dword13	= cpu_to_le32(lower_32_bits(descs_dma));
	cmd.features.dword14	= cpu_to_le32(upper_32_bits(descs_dma));
	cmd.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &cmd, NULL, 0);
	if (!ret)
		return 0;

	dev_warn(dev->ctrl.device,
		 "failed to set host mem (err %d, flags %#x).\n",
		 ret, bits);
	return ret;
}

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
1918
		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
1919

1920 1921 1922
		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
			       le64_to_cpu(desc->addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
1923 1924 1925 1926
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
1927 1928 1929
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
1930
	dev->host_mem_descs = NULL;
1931
	dev->nr_host_mem_descs = 0;
1932 1933
}

1934 1935
/*
 * Try to assemble a host memory buffer of up to @preferred bytes from
 * chunks of at most @chunk_size bytes each.
 *
 * The descriptor budget is @preferred divided by @chunk_size, rounded up,
 * and capped by the controller's hmmaxd value when that is non-zero.
 * Chunks are allocated until @preferred bytes are covered, the budget is
 * exhausted, or an allocation fails.  A partial buffer is accepted as long
 * as at least one chunk was obtained.
 *
 * On success the descriptor table, its DMA address, the chunk address
 * array, the descriptor count and the total size are stored in @dev and 0
 * is returned.  On failure everything allocated here is released,
 * dev->host_mem_descs is set to NULL and -ENOMEM is returned.
 */
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	const unsigned long attrs =
		DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN;
	struct nvme_host_mem_buf_desc *descs;
	dma_addr_t descs_dma;
	void **bufs;
	u64 total = 0, nr_chunks;
	u32 max_entries, len;
	int nr = 0;

	/* Round-up division; do_div() stores the quotient in nr_chunks. */
	nr_chunks = preferred + chunk_size - 1;
	do_div(nr_chunks, chunk_size);
	max_entries = nr_chunks;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
				   &descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	while (total < preferred && nr < max_entries) {
		dma_addr_t chunk_dma;

		len = min_t(u64, chunk_size, preferred - total);
		bufs[nr] = dma_alloc_attrs(dev->dev, len, &chunk_dma,
					   GFP_KERNEL, attrs);
		if (!bufs[nr])
			break;

		descs[nr].addr = cpu_to_le64(chunk_dma);
		descs[nr].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
		nr++;
		total += len;
	}

	/* Not a single chunk could be allocated. */
	if (!total)
		goto out_free_bufs;

	dev->nr_host_mem_descs = nr;
	dev->host_mem_size = total;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--nr >= 0) {
		size_t bytes = le32_to_cpu(descs[nr].size) * NVME_CTRL_PAGE_SIZE;

		dma_free_attrs(dev->dev, bytes, bufs[nr],
			       le64_to_cpu(descs[nr].addr), attrs);
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

2002 2003
/*
 * Allocate a host memory buffer, trying progressively smaller chunk sizes.
 *
 * The first attempt uses chunks of min(@preferred,
 * PAGE_SIZE * MAX_ORDER_NR_PAGES) bytes.  The chunk size is halved after
 * every attempt that fails or that yields less than @min bytes, and the
 * search stops once it would drop below the lower bound computed from the
 * controller's hmminds value (never less than two pages).
 *
 * Returns 0 once a buffer of at least @min bytes (any size when @min is 0)
 * has been set up, -ENOMEM otherwise.
 */
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	/*
	 * Widen before scaling.  Evaluating hmminds * 4096 in 32 bits (and
	 * truncating again through max_t(u32, ...)) wraps for large hmminds
	 * values, which would let the loop below fall under the minimum
	 * descriptor size the controller asked for.
	 */
	u64 hmminds = max_t(u64, (u64)dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	u64 chunk_size;

	/* start big and work our way down */
	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			/* Too small to be useful: release and retry smaller. */
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}

2020
static int nvme_setup_host_mem(struct nvme_dev *dev)
2021 2022 2023 2024 2025
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
2026
	int ret;
2027 2028 2029 2030 2031 2032 2033

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
2034
		return 0;
2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047
	}

	/*
	 * If we already have a buffer allocated check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
2048 2049 2050
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
2051
			return 0; /* controller must work without HMB */
2052 2053 2054 2055 2056
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
2057 2058
	}

2059 2060
	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
2061
		nvme_free_host_mem(dev);
2062
	return ret;
K
Keith Busch 已提交
2063 2064
}

2065 2066 2067 2068 2069
/*
 * nirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
 */
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
2070
{
2071
	struct nvme_dev *dev = affd->priv;
2072
	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
2073 2074

	/*
B
Baolin Wang 已提交
2075
	 * If there is no interrupt available for queues, ensure that
2076 2077 2078 2079 2080 2081 2082 2083
	 * the default queue is set to 1. The affinity set size is
	 * also set to one, but the irq core ignores it for this case.
	 *
	 * If only one interrupt is available or 'write_queue' == 0, combine
	 * write and read queues.
	 *
	 * If 'write_queues' > 0, ensure it leaves room for at least one read
	 * queue.
2084
	 */
2085 2086 2087
	if (!nrirqs) {
		nrirqs = 1;
		nr_read_queues = 0;
2088
	} else if (nrirqs == 1 || !nr_write_queues) {
2089
		nr_read_queues = 0;
2090
	} else if (nr_write_queues >= nrirqs) {
2091
		nr_read_queues = 1;
2092
	} else {
2093
		nr_read_queues = nrirqs - nr_write_queues;
2094
	}
2095 2096 2097 2098 2099 2100

	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
	affd->nr_sets = nr_read_queues ? 2 : 1;
2101 2102
}

2103
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
2104 2105 2106
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	struct irq_affinity affd = {
2107
		.pre_vectors	= 1,
2108 2109
		.calc_sets	= nvme_calc_irq_sets,
		.priv		= dev,
2110
	};
2111
	unsigned int irq_queues, poll_queues;
2112 2113

	/*
2114 2115
	 * Poll queues don't need interrupts, but we need at least one I/O queue
	 * left over for non-polled I/O.
2116
	 */
2117 2118
	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
2119

2120 2121 2122 2123
	/*
	 * Initialize for the single interrupt case, will be updated in
	 * nvme_calc_irq_sets().
	 */
2124 2125
	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
	dev->io_queues[HCTX_TYPE_READ] = 0;
2126

2127
	/*
2128 2129 2130
	 * We need interrupts for the admin queue and each non-polled I/O queue,
	 * but some Apple controllers require all queues to use the first
	 * vector.
2131
	 */
2132 2133 2134
	irq_queues = 1;
	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
		irq_queues += (nr_io_queues - poll_queues);
2135 2136
	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
2137 2138
}

2139 2140 2141 2142 2143 2144
/*
 * Delete the I/O queues: submission queues first, and completion queues
 * only when the submission queue pass reported success.
 */
static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (!__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		return;

	__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}

2145 2146
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
2147 2148 2149 2150 2151 2152
	/*
	 * If tags are shared with admin queue (Apple bug), then
	 * make sure we only use one IO queue.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
		return 1;
2153 2154 2155
	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
}

2156
static int nvme_setup_io_queues(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
2157
{
2158
	struct nvme_queue *adminq = &dev->queues[0];
2159
	struct pci_dev *pdev = to_pci_dev(dev->dev);
2160
	unsigned int nr_io_queues;
2161
	unsigned long size;
2162
	int result;
M
Matthew Wilcox 已提交
2163

2164 2165 2166 2167 2168 2169
	/*
	 * Sample the module parameters once at reset time so that we have
	 * stable values to work with.
	 */
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
2170

2171
	nr_io_queues = dev->nr_allocated_queues - 1;
C
Christoph Hellwig 已提交
2172 2173
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
M
Matthew Wilcox 已提交
2174
		return result;
C
Christoph Hellwig 已提交
2175

2176
	if (nr_io_queues == 0)
2177
		return 0;
2178

2179
	clear_bit(NVMEQ_ENABLED, &adminq->flags);
M
Matthew Wilcox 已提交
2180

2181
	if (dev->cmb_use_sqes) {
2182 2183 2184 2185 2186
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
2187
			dev->cmb_use_sqes = false;
2188 2189
	}

2190 2191 2192 2193 2194 2195 2196 2197 2198
	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues)
			return -ENOMEM;
	} while (1);
	adminq->q_db = dev->dbs;
2199

2200
 retry:
K
Keith Busch 已提交
2201
	/* Deregister the admin queue's interrupt */
2202
	pci_free_irq(pdev, 0, adminq);
K
Keith Busch 已提交
2203

2204 2205 2206 2207
	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
2208
	pci_free_irq_vectors(pdev);
2209 2210

	result = nvme_setup_irqs(dev, nr_io_queues);
2211
	if (result <= 0)
2212
		return -EIO;
2213

2214
	dev->num_vecs = result;
J
Jens Axboe 已提交
2215
	result = max(result - 1, 1);
2216
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
R
Ramachandra Rao Gajula 已提交
2217

2218 2219 2220 2221 2222 2223
	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
2224
	result = queue_request_irq(adminq);
2225
	if (result)
K
Keith Busch 已提交
2226
		return result;
2227
	set_bit(NVMEQ_ENABLED, &adminq->flags);
2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243

	result = nvme_create_io_queues(dev);
	if (result || dev->online_queues < 2)
		return result;

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_disable_io_queues(dev);
		nvme_suspend_io_queues(dev);
		goto retry;
	}
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
	return 0;
M
Matthew Wilcox 已提交
2244 2245
}

2246
/*
 * Completion handler for a queue deletion request: free the request and
 * signal delete_done on the queue stored in its end_io_data.
 */
static void nvme_del_queue_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *queue = req->end_io_data;

	blk_mq_free_request(req);
	complete(&queue->delete_done);
}

2254
/*
 * Completion handler for a completion queue deletion: record a failure in
 * the queue flags, then finish like any other queue deletion.
 */
static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
	if (error) {
		struct nvme_queue *queue = req->end_io_data;

		set_bit(NVMEQ_DELETE_ERROR, &queue->flags);
	}

	nvme_del_queue_end(req, error);
}

K
Keith Busch 已提交
2264
/*
 * Submit an asynchronous delete command (@opcode) for @nvmeq on the admin
 * queue.  The request is allocated with BLK_MQ_REQ_NOWAIT; completion is
 * reported through nvmeq->delete_done.
 *
 * Returns 0 once the request has been handed over, or the error from the
 * request allocation.
 */
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *admin_q = nvmeq->dev->ctrl.admin_q;
	void (*done)(struct request *, blk_status_t);
	struct nvme_command cmd;
	struct request *rq;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	rq = nvme_alloc_request(admin_q, &cmd, BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* Completion queue deletions additionally record errors. */
	if (opcode == nvme_admin_delete_cq)
		done = nvme_del_cq_end;
	else
		done = nvme_del_queue_end;

	rq->end_io_data = nvmeq;

	init_completion(&nvmeq->delete_done);
	blk_execute_rq_nowait(NULL, rq, false, done);
	return 0;
}

2287
/*
 * Send @opcode (a queue deletion command) for every online I/O queue and
 * wait for the completions.
 *
 * Commands are issued from the highest queue index downwards until all are
 * sent or nvme_delete_queue() fails.  The function then waits for the
 * oldest outstanding command; after each completion it goes back to
 * submitting if queues are still left, which also restarts the timeout at
 * NVME_ADMIN_TIMEOUT.
 *
 * Returns true when every command completed, false as soon as one wait
 * times out.
 */
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int nr_queues = dev->online_queues - 1, sent = 0;
	unsigned long timeout;

 retry:
	timeout = NVME_ADMIN_TIMEOUT;
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
	}
	while (sent) {
		/* nr_queues + sent is the oldest command still outstanding. */
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
				timeout);
		if (timeout == 0)
			return false;

		sent--;
		if (nr_queues)
			goto retry;
	}
	return true;
}

K
Keith Busch 已提交
2315
static void nvme_dev_add(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
2316
{
2317 2318
	int ret;

2319
	if (!dev->ctrl.tagset) {
2320
		dev->tagset.ops = &nvme_mq_ops;
2321
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
2322
		dev->tagset.nr_maps = 2; /* default + read */
2323 2324
		if (dev->io_queues[HCTX_TYPE_POLL])
			dev->tagset.nr_maps++;
2325
		dev->tagset.timeout = NVME_IO_TIMEOUT;
2326
		dev->tagset.numa_node = dev->ctrl.numa_node;
2327 2328
		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
						BLK_MQ_MAX_DEPTH) - 1;
2329
		dev->tagset.cmd_size = sizeof(struct nvme_iod);
2330 2331
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;
M
Matthew Wilcox 已提交
2332

2333 2334 2335 2336 2337 2338 2339 2340
		/*
		 * Some Apple controllers requires tags to be unique
		 * across admin and IO queue, so reserve the first 32
		 * tags of the IO queue.
		 */
		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
			dev->tagset.reserved_tags = NVME_AQ_DEPTH;

2341 2342 2343 2344
		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
K
Keith Busch 已提交
2345
			return;
2346
		}
2347
		dev->ctrl.tagset = &dev->tagset;
2348 2349 2350 2351 2352
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
2353
	}
2354

2355
	nvme_dbbuf_set(dev);
M
Matthew Wilcox 已提交
2356 2357
}

2358
static int nvme_pci_enable(struct nvme_dev *dev)
2359
{
2360
	int result = -ENOMEM;
2361
	struct pci_dev *pdev = to_pci_dev(dev->dev);
2362
	int dma_address_bits = 64;
2363 2364 2365 2366 2367 2368

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

2369 2370 2371
	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
		dma_address_bits = 48;
	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
2372
		goto disable;
2373

2374
	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
K
Keith Busch 已提交
2375
		result = -ENODEV;
2376
		goto disable;
K
Keith Busch 已提交
2377
	}
2378 2379

	/*
2380 2381 2382
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
2383
	 */
2384 2385 2386
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;
2387

2388
	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
2389

2390
	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
2391
				io_queue_depth);
2392
	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
2393
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
2394
	dev->dbs = dev->bar + 4096;
2395

2396 2397 2398 2399 2400 2401 2402 2403 2404
	/*
	 * Some Apple controllers require a non-standard SQE size.
	 * Interestingly they also seem to ignore the CC:IOSQES register
	 * so we don't bother updating it here.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
		dev->io_sqes = 7;
	else
		dev->io_sqes = NVME_NVM_IOSQES;
2405 2406 2407 2408 2409 2410 2411

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
2412 2413
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
2414
			dev->q_depth);
2415 2416
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
2417
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
2418 2419 2420
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
                        "set queue depth=%u\n", dev->q_depth);
2421 2422
	}

2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434
	/*
	 * Controllers with the shared tags quirk need the IO queue to be
	 * big enough so that we get 32 tags for the admin queue
	 */
	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
		dev->q_depth = NVME_AQ_DEPTH + 2;
		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
			 dev->q_depth);
	}


2435
	nvme_map_cmb(dev);
2436

K
Keith Busch 已提交
2437 2438
	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
2439 2440 2441 2442 2443 2444 2445 2446
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
2447 2448 2449
{
	if (dev->bar)
		iounmap(dev->bar);
2450
	pci_release_mem_regions(to_pci_dev(dev->dev));
2451 2452 2453
}

static void nvme_pci_disable(struct nvme_dev *dev)
2454
{
2455 2456
	struct pci_dev *pdev = to_pci_dev(dev->dev);

2457
	pci_free_irq_vectors(pdev);
2458

K
Keith Busch 已提交
2459 2460
	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
2461
		pci_disable_device(pdev);
K
Keith Busch 已提交
2462 2463 2464
	}
}

2465
/*
 * Quiesce the controller and tear down its queues and PCI resources.
 *
 * Runs entirely under dev->shutdown_lock.  The controller is treated as
 * dead unless the PCI function is enabled, CSTS reports ready without a
 * fatal status, and the PCI channel is in its normal state.  For a live
 * controller the I/O and admin queues are deleted through admin commands;
 * in every case the queues are suspended, the PCI function is disabled and
 * all outstanding requests are cancelled.
 *
 * @shutdown: true when the device is not going to be restarted.  A live,
 * frozen controller is then given NVME_IO_TIMEOUT to drain, and the queues
 * are restarted at the end so that entered requests fail instead of
 * hanging.
 */
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	bool dead = true, freeze = false;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			freeze = true;
			nvme_start_freeze(&dev->ctrl);
		}
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state != pci_channel_io_normal);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead && shutdown && freeze)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);

	nvme_stop_queues(&dev->ctrl);

	/* Queue deletion needs admin commands, so only try it when alive. */
	if (!dead && dev->ctrl.queue_count > 0) {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
	nvme_pci_disable(dev);
	nvme_reap_pending_cqes(dev);

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_wait_completed_request(&dev->tagset);
	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
	if (shutdown) {
		nvme_start_queues(&dev->ctrl);
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
	}
	mutex_unlock(&dev->shutdown_lock);
}

2519 2520 2521 2522 2523 2524 2525 2526
/*
 * Disable the device once nvme_wait_reset() reports success.
 *
 * Returns 0 after the device has been disabled, or -EBUSY when
 * nvme_wait_reset() fails (the device is then left untouched).
 */
static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (nvme_wait_reset(&dev->ctrl)) {
		nvme_dev_disable(dev, shutdown);
		return 0;
	}

	return -EBUSY;
}

M
Matthew Wilcox 已提交
2527 2528
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
2529
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
C
Christoph Hellwig 已提交
2530 2531
						NVME_CTRL_PAGE_SIZE,
						NVME_CTRL_PAGE_SIZE, 0);
M
Matthew Wilcox 已提交
2532 2533 2534
	if (!dev->prp_page_pool)
		return -ENOMEM;

2535
	/* Optimisation for I/Os between 4k and 128k */
2536
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
2537 2538 2539 2540 2541
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
M
Matthew Wilcox 已提交
2542 2543 2544 2545 2546 2547
	return 0;
}

/* Destroy both PRP list DMA pools, the page sized one first. */
static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	struct dma_pool *page_pool = dev->prp_page_pool;
	struct dma_pool *small_pool = dev->prp_small_pool;

	dma_pool_destroy(page_pool);
	dma_pool_destroy(small_pool);
}

2551 2552 2553 2554 2555 2556 2557
static void nvme_free_tagset(struct nvme_dev *dev)
{
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	dev->ctrl.tagset = NULL;
}

2558
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
2559
{
2560
	struct nvme_dev *dev = to_nvme_dev(ctrl);
2561

2562
	nvme_dbbuf_dma_free(dev);
2563
	nvme_free_tagset(dev);
2564 2565
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
2566
	free_opal_dev(dev->ctrl.opal_dev);
2567
	mempool_destroy(dev->iod_mempool);
2568 2569
	put_device(dev->dev);
	kfree(dev->queues);
2570 2571 2572
	kfree(dev);
}

2573
static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
2574
{
2575 2576 2577 2578 2579
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
2580
	nvme_get_ctrl(&dev->ctrl);
2581
	nvme_dev_disable(dev, false);
2582
	nvme_kill_queues(&dev->ctrl);
2583
	if (!queue_work(nvme_wq, &dev->remove_work))
2584 2585 2586
		nvme_put_ctrl(&dev->ctrl);
}

2587
static void nvme_reset_work(struct work_struct *work)
2588
{
2589 2590
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
2591
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
2592
	int result;
2593

2594 2595
	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
		result = -ENODEV;
2596
		goto out;
2597
	}
2598

2599 2600 2601 2602
	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
2603
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
2604
		nvme_dev_disable(dev, false);
K
Keith Busch 已提交
2605
	nvme_sync_queues(&dev->ctrl);
2606

2607
	mutex_lock(&dev->shutdown_lock);
2608
	result = nvme_pci_enable(dev);
2609
	if (result)
2610
		goto out_unlock;
2611

2612
	result = nvme_pci_configure_admin_queue(dev);
2613
	if (result)
2614
		goto out_unlock;
2615

K
Keith Busch 已提交
2616 2617
	result = nvme_alloc_admin_tags(dev);
	if (result)
2618
		goto out_unlock;
2619

2620 2621 2622 2623
	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
2624 2625
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
2626
	dev->ctrl.max_segments = NVME_MAX_SEGS;
2627 2628 2629 2630 2631

	/*
	 * Don't limit the IOMMU merged segment size.
	 */
	dma_set_max_seg_size(dev->dev, 0xffffffff);
J
Jianxiong Gao 已提交
2632
	dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
2633

2634 2635 2636 2637 2638 2639 2640 2641 2642
	mutex_unlock(&dev->shutdown_lock);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
2643
		result = -EBUSY;
2644 2645
		goto out;
	}
2646

2647 2648 2649 2650 2651 2652
	/*
	 * We do not support an SGL for metadata (yet), so we are limited to a
	 * single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;

2653
	result = nvme_init_ctrl_finish(&dev->ctrl);
2654
	if (result)
2655
		goto out;
2656

2657 2658 2659 2660 2661 2662 2663 2664 2665
	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
2666
	}
2667

2668 2669 2670 2671 2672 2673 2674
	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

2675 2676 2677 2678 2679
	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}
2680

2681
	result = nvme_setup_io_queues(dev);
2682
	if (result)
2683
		goto out;
2684

2685 2686 2687 2688
	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
2689
	if (dev->online_queues < 2) {
2690
		dev_warn(dev->ctrl.device, "IO queues not created\n");
2691
		nvme_kill_queues(&dev->ctrl);
2692
		nvme_remove_namespaces(&dev->ctrl);
2693
		nvme_free_tagset(dev);
2694
	} else {
2695
		nvme_start_queues(&dev->ctrl);
K
Keith Busch 已提交
2696
		nvme_wait_freeze(&dev->ctrl);
K
Keith Busch 已提交
2697
		nvme_dev_add(dev);
K
Keith Busch 已提交
2698
		nvme_unfreeze(&dev->ctrl);
2699 2700
	}

2701 2702 2703 2704
	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
K
Keith Busch 已提交
2705
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
2706
		dev_warn(dev->ctrl.device,
K
Keith Busch 已提交
2707
			"failed to mark controller live state\n");
2708
		result = -ENODEV;
2709 2710
		goto out;
	}
2711

2712
	nvme_start_ctrl(&dev->ctrl);
2713
	return;
2714

2715 2716
 out_unlock:
	mutex_unlock(&dev->shutdown_lock);
2717
 out:
2718 2719 2720 2721
	if (result)
		dev_warn(dev->ctrl.device,
			 "Removing after probe failure status: %d\n", result);
	nvme_remove_dead_ctrl(dev);
2722 2723
}

2724
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
K
Keith Busch 已提交
2725
{
2726
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
2727
	struct pci_dev *pdev = to_pci_dev(dev->dev);
K
Keith Busch 已提交
2728 2729

	if (pci_get_drvdata(pdev))
K
Keith Busch 已提交
2730
		device_release_driver(&pdev->dev);
2731
	nvme_put_ctrl(&dev->ctrl);
K
Keith Busch 已提交
2732 2733
}

2734
/*
 * reg_read32 callback: read the 32-bit register at offset @off of the
 * mapped BAR into *@val.  Always returns 0.
 */
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	*val = readl(dev->bar + off);
	return 0;
}

2740
/*
 * reg_write32 callback: write @val to the 32-bit register at offset @off of
 * the mapped BAR.  Always returns 0.
 */
static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	writel(val, dev->bar + off);
	return 0;
}
2745

2746 2747
/*
 * reg_read64 callback: read the 64-bit register at offset @off of the
 * mapped BAR into *@val using lo_hi_readq().  Always returns 0.
 */
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	*val = lo_hi_readq(dev->bar + off);
	return 0;
}

2752 2753 2754 2755
static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

2756
	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
2757 2758
}

2759
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
M
Ming Lin 已提交
2760
	.name			= "pcie",
2761
	.module			= THIS_MODULE,
2762 2763
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
2764
	.reg_read32		= nvme_pci_reg_read32,
2765
	.reg_write32		= nvme_pci_reg_write32,
2766
	.reg_read64		= nvme_pci_reg_read64,
2767
	.free_ctrl		= nvme_pci_free_ctrl,
2768
	.submit_async_event	= nvme_pci_submit_async_event,
2769
	.get_address		= nvme_pci_get_address,
2770
};
2771

2772 2773 2774 2775
static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

2776
	if (pci_request_mem_regions(pdev, "nvme"))
2777 2778
		return -ENODEV;

2779
	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
2780 2781
		goto release;

M
Max Gurtovoy 已提交
2782
	return 0;
2783
  release:
M
Max Gurtovoy 已提交
2784 2785
	pci_release_mem_regions(pdev);
	return -ENODEV;
2786 2787
}

2788
static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and uses the deepest sleep state.
		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
		 * 950 PRO 256GB", but it seems to be restricted to two Dell
		 * laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
2803 2804 2805
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
2806 2807 2808
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A
2809 2810
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
2811 2812
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
2813
			return NVME_QUIRK_NO_APST;
2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825
	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
		/*
		 * Forcing to use host managed nvme power settings for
		 * lowest idle power with quick resume latency on
		 * Samsung and Toshiba SSDs based on suspend behavior
		 * on Coffee Lake board for LENOVO C640
		 */
		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
			return NVME_QUIRK_SIMPLE_SUSPEND;
2826 2827 2828 2829 2830
	}

	return 0;
}

2831 2832 2833
/*
 * Asynchronous part of probing, scheduled from nvme_probe(): wait for the
 * reset work and then the scan work to finish, and drop a controller
 * reference.
 */
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *ndev = data;
	struct nvme_ctrl *ctrl = &ndev->ctrl;

	flush_work(&ctrl->reset_work);
	flush_work(&ctrl->scan_work);
	nvme_put_ctrl(ctrl);
}

2840
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
M
Matthew Wilcox 已提交
2841
{
M
Matias Bjørling 已提交
2842
	int node, result = -ENOMEM;
M
Matthew Wilcox 已提交
2843
	struct nvme_dev *dev;
2844
	unsigned long quirks = id->driver_data;
2845
	size_t alloc_size;
M
Matthew Wilcox 已提交
2846

M
Matias Bjørling 已提交
2847 2848
	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
2849
		set_dev_node(&pdev->dev, first_memory_node);
M
Matias Bjørling 已提交
2850 2851

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
M
Matthew Wilcox 已提交
2852 2853
	if (!dev)
		return -ENOMEM;
2854

2855 2856 2857 2858 2859
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
M
Matthew Wilcox 已提交
2860 2861 2862
	if (!dev->queues)
		goto free;

2863
	dev->dev = get_device(&pdev->dev);
K
Keith Busch 已提交
2864
	pci_set_drvdata(pdev, dev);
2865

2866 2867
	result = nvme_dev_map(dev);
	if (result)
2868
		goto put_pci;
2869

2870
	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
2871
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
2872
	mutex_init(&dev->shutdown_lock);
M
Matthew Wilcox 已提交
2873

M
Matthew Wilcox 已提交
2874 2875
	result = nvme_setup_prp_pools(dev);
	if (result)
2876
		goto unmap;
2877

2878
	quirks |= check_vendor_combination_bug(pdev);
2879

2880
	if (!noacpi && acpi_storage_d3(&pdev->dev)) {
2881 2882 2883 2884 2885 2886 2887 2888 2889
		/*
		 * Some systems use a bios work around to ask for D3 on
		 * platforms that support kernel managed suspend.
		 */
		dev_info(&pdev->dev,
			 "platform quirk: setting simple suspend\n");
		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
	}

2890 2891 2892 2893
	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
2894
	alloc_size = nvme_pci_iod_alloc_size();
2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

2906 2907 2908 2909 2910
	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

2911 2912
	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

2913
	nvme_reset_ctrl(&dev->ctrl);
2914
	async_schedule(nvme_async_probe, dev);
2915

M
Matthew Wilcox 已提交
2916 2917
	return 0;

2918 2919
 release_mempool:
	mempool_destroy(dev->iod_mempool);
2920
 release_pools:
M
Matthew Wilcox 已提交
2921
	nvme_release_prp_pools(dev);
2922 2923
 unmap:
	nvme_dev_unmap(dev);
K
Keith Busch 已提交
2924
 put_pci:
2925
	put_device(dev->dev);
M
Matthew Wilcox 已提交
2926 2927 2928 2929 2930 2931
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

2932
static void nvme_reset_prepare(struct pci_dev *pdev)
2933
{
K
Keith Busch 已提交
2934
	struct nvme_dev *dev = pci_get_drvdata(pdev);
2935 2936 2937 2938 2939 2940 2941 2942

	/*
	 * We don't need to check the return value from waiting for the reset
	 * state as pci_dev device lock is held, making it impossible to race
	 * with ->remove().
	 */
	nvme_disable_prepare_reset(dev, false);
	nvme_sync_queues(&dev->ctrl);
2943
}
2944

2945 2946
static void nvme_reset_done(struct pci_dev *pdev)
{
2947
	struct nvme_dev *dev = pci_get_drvdata(pdev);
2948 2949 2950

	if (!nvme_try_sched_reset(&dev->ctrl))
		flush_work(&dev->ctrl.reset_work);
2951 2952
}

2953 2954 2955
static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
2956

2957
	nvme_disable_prepare_reset(dev, true);
2958 2959
}

2960 2961 2962 2963 2964
/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
2965
static void nvme_remove(struct pci_dev *pdev)
M
Matthew Wilcox 已提交
2966 2967
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
K
Keith Busch 已提交
2968

2969
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
K
Keith Busch 已提交
2970
	pci_set_drvdata(pdev, NULL);
2971

2972
	if (!pci_device_is_present(pdev)) {
2973
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
2974
		nvme_dev_disable(dev, true);
2975
		nvme_dev_remove_admin(dev);
2976
	}
2977

2978
	flush_work(&dev->ctrl.reset_work);
2979 2980
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
2981
	nvme_dev_disable(dev, true);
2982
	nvme_release_cmb(dev);
2983
	nvme_free_host_mem(dev);
M
Matias Bjørling 已提交
2984
	nvme_dev_remove_admin(dev);
2985
	nvme_free_queues(dev, 0);
K
Keith Busch 已提交
2986
	nvme_release_prp_pools(dev);
2987
	nvme_dev_unmap(dev);
2988
	nvme_uninit_ctrl(&dev->ctrl);
M
Matthew Wilcox 已提交
2989 2990
}

2991
#ifdef CONFIG_PM_SLEEP
2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006
/*
 * Read the controller's Power Management feature; the result is stored
 * through @ps. Returns whatever nvme_get_features() returns.
 */
static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
	int status;

	status = nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
	return status;
}

/*
 * Write @ps to the controller's Power Management feature. Returns whatever
 * nvme_set_features() returns.
 */
static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
{
	int status;

	status = nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0,
				   NULL);
	return status;
}

static int nvme_resume(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
	struct nvme_ctrl *ctrl = &ndev->ctrl;

3007
	if (ndev->last_ps == U32_MAX ||
3008
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
3009
		return nvme_try_sched_reset(&ndev->ctrl);
3010 3011 3012
	return 0;
}

3013 3014 3015 3016
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);
3017 3018 3019
	struct nvme_ctrl *ctrl = &ndev->ctrl;
	int ret = -EBUSY;

3020 3021
	ndev->last_ps = U32_MAX;

3022 3023 3024 3025 3026 3027 3028
	/*
	 * The platform does not remove power for a kernel managed suspend so
	 * use host managed nvme power settings for lowest idle power if
	 * possible. This should have quicker resume latency than a full device
	 * shutdown.  But if the firmware is involved after the suspend or the
	 * device does not support any non-default power states, shut down the
	 * device fully.
3029 3030 3031 3032 3033
	 *
	 * If ASPM is not enabled for the device, shut down the device and allow
	 * the PCI bus layer to put it into D3 in order to take the PCIe link
	 * down, so as to allow the platform to achieve its minimum low-power
	 * state (which may not be possible if the link is up).
3034 3035 3036 3037 3038
	 *
	 * If a host memory buffer is enabled, shut down the device as the NVMe
	 * specification allows the device to access the host memory buffer in
	 * host DRAM from all power states, but hosts will fail access to DRAM
	 * during S3.
3039
	 */
3040
	if (pm_suspend_via_firmware() || !ctrl->npss ||
3041
	    !pcie_aspm_enabled(pdev) ||
3042
	    ndev->nr_host_mem_descs ||
3043 3044
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);
3045 3046 3047 3048 3049

	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

K
Keith Busch 已提交
3050
	if (ctrl->state != NVME_CTRL_LIVE)
3051 3052 3053 3054 3055 3056
		goto unfreeze;

	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
	if (ret < 0)
		goto unfreeze;

3057 3058 3059 3060 3061 3062 3063
	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);

3064 3065 3066 3067 3068
	ret = nvme_set_power_state(ctrl, ctrl->npss);
	if (ret < 0)
		goto unfreeze;

	if (ret) {
3069 3070 3071
		/* discard the saved state */
		pci_load_saved_state(pdev, NULL);

3072 3073
		/*
		 * Clearing npss forces a controller reset on resume. The
3074
		 * correct value will be rediscovered then.
3075
		 */
3076
		ret = nvme_disable_prepare_reset(ndev, true);
3077 3078 3079 3080 3081 3082 3083 3084 3085 3086
		ctrl->npss = 0;
	}
unfreeze:
	nvme_unfreeze(ctrl);
	return ret;
}

static int nvme_simple_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
3087

3088
	return nvme_disable_prepare_reset(ndev, true);
3089 3090
}

3091
static int nvme_simple_resume(struct device *dev)
3092 3093 3094 3095
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

3096
	return nvme_try_sched_reset(&ndev->ctrl);
3097 3098
}

3099
static const struct dev_pm_ops nvme_dev_pm_ops = {
3100 3101 3102 3103 3104 3105 3106 3107
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.freeze		= nvme_simple_suspend,
	.thaw		= nvme_simple_resume,
	.poweroff	= nvme_simple_suspend,
	.restore	= nvme_simple_resume,
};
#endif /* CONFIG_PM_SLEEP */
M
Matthew Wilcox 已提交
3108

K
Keith Busch 已提交
3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122
/*
 * ->error_detected() hook of nvme_err_handler.
 *
 * Maps the reported channel state to a recovery request:
 *   - normal:       PCI_ERS_RESULT_CAN_RECOVER
 *   - frozen:       disable the controller, PCI_ERS_RESULT_NEED_RESET
 *   - perm_failure: PCI_ERS_RESULT_DISCONNECT
 * Any other state value requests a reset.
 */
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

3139
	dev_info(dev->ctrl.device, "restart after slot reset\n");
K
Keith Busch 已提交
3140
	pci_restore_state(pdev);
3141
	nvme_reset_ctrl(&dev->ctrl);
K
Keith Busch 已提交
3142 3143 3144 3145 3146
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
K
Keith Busch 已提交
3147 3148 3149
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
K
Keith Busch 已提交
3150 3151
}

3152
static const struct pci_error_handlers nvme_err_handler = {
M
Matthew Wilcox 已提交
3153 3154 3155
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
3156 3157
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
M
Matthew Wilcox 已提交
3158 3159
};

3160
static const struct pci_device_id nvme_id_table[] = {
3161
	{ PCI_VDEVICE(INTEL, 0x0953),	/* Intel 750/P3500/P3600/P3700 */
3162
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3163
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3164
	{ PCI_VDEVICE(INTEL, 0x0a53),	/* Intel P3520 */
3165
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3166
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3167
	{ PCI_VDEVICE(INTEL, 0x0a54),	/* Intel P4500/P4600 */
3168
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3169
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3170
	{ PCI_VDEVICE(INTEL, 0x0a55),	/* Dell Express Flash P4600 */
3171 3172
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3173
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
3174
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
3175
				NVME_QUIRK_MEDIUM_PRIO_SQ |
3176 3177
				NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3178 3179
	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3180
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
3181 3182
		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3183 3184
	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
3185
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
3186 3187
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
				NVME_QUIRK_NO_NS_DESC_LIST, },
3188 3189
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3190 3191
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3192 3193
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3194 3195 3196
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
3197
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
3198
				NVME_QUIRK_DISABLE_WRITE_ZEROES|
3199
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3200 3201
	{ PCI_DEVICE(0x1987, 0x5016),	/* Phison E16 */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3202 3203 3204
	{ PCI_DEVICE(0x1b4b, 0x1092),	/* Lexar 256 GB SSD */
		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
C
Christoph Hellwig 已提交
3205 3206 3207 3208
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
W
Wei Xu 已提交
3209 3210
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
3211 3212
	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3213 3214 3215
	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3216 3217
	{ PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3218 3219
	{ PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3220 3221
	{ PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3222 3223
	{ PCI_DEVICE(0x2646, 0x2262),   /* KINGSTON SKC2000 NVMe SSD */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3224 3225
	{ PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD  */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3238 3239
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
3240
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
3241 3242
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
3243 3244
				NVME_QUIRK_128_BYTES_SQES |
				NVME_QUIRK_SHARED_TAGS },
3245 3246

	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
M
Matthew Wilcox 已提交
3247 3248 3249 3250 3251 3252 3253 3254
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
3255
	.remove		= nvme_remove,
3256
	.shutdown	= nvme_shutdown,
3257
#ifdef CONFIG_PM_SLEEP
3258 3259 3260
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
3261
#endif
3262
	.sriov_configure = pci_sriov_configure_simple,
M
Matthew Wilcox 已提交
3263 3264 3265 3266 3267
	.err_handler	= &nvme_err_handler,
};

/*
 * Module entry point: verify a few compile-time assumptions, then register
 * the PCI driver. Returns the result of pci_register_driver().
 */
static int __init nvme_init(void)
{
	/* These admin command structures must be exactly 64 bytes. */
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	/* The driver needs at least two IRQ affinity sets. */
	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);

	return pci_register_driver(&nvme_driver);
}

/*
 * Module exit point: unregister the PCI driver, then wait for any work
 * still queued on nvme_wq to finish.
 */
static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	flush_workqueue(nvme_wq);
}

/* Module metadata and entry/exit point registration. */
MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);