// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */

#include <linux/acpi.h>
#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>

#include "trace.h"
#include "nvme.h"

#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0444);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
	.get = param_get_uint,
};

static unsigned int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");

static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
	unsigned int n;
	int ret;

	ret = kstrtouint(val, 10, &n);
	if (ret != 0 || n > num_possible_cpus())
		return -EINVAL;
	return param_set_uint(val, kp);
}

static const struct kernel_param_ops io_queue_count_ops = {
	.set = io_queue_count_set,
	.get = param_get_uint,
};

static unsigned int write_queues;
module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

static unsigned int poll_queues;
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");

struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue *queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
	unsigned io_queues[HCTX_MAX_TYPES];
	unsigned int num_vecs;
	u32 q_depth;
	int io_sqes;
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	u64 cmb_size;
	bool cmb_use_sqes;
	u32 cmbsz;
	u32 cmbloc;
	struct nvme_ctrl ctrl;
	u32 last_ps;

	mempool_t *iod_mempool;

	/* shadow doorbell buffer support: */
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int ret;
	u32 n;

	ret = kstrtou32(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_uint(val, kp);
}

static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct nvme_dev *dev;
	spinlock_t sq_lock;
	void *sq_cmds;
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
	struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u32 q_depth;
	u16 cq_vector;
	u16 sq_tail;
	u16 last_sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 sqes;
	unsigned long flags;
#define NVMEQ_ENABLED		0
#define NVMEQ_SQ_CMB		1
#define NVMEQ_DELETE_ERROR	2
#define NVMEQ_POLLED		3
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
	struct completion delete_done;
};

/*
 * The nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
 * to the actual struct scatterlist.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_command cmd;
	struct nvme_queue *nvmeq;
	bool use_sgl;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	dma_addr_t first_dma;
	unsigned int dma_len;	/* length of single DMA segment mapping */
	dma_addr_t meta_dma;
	struct scatterlist *sg;
};

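/*
 * Shadow doorbell ("dbbuf") support: the controller can read doorbell values
 * from host memory instead of requiring an MMIO write for every update.  Two
 * buffers are allocated, one holding the shadow doorbell values and one the
 * event-index hints; each queue uses one 4-byte SQ slot and one 4-byte CQ
 * slot (hence 8 bytes per queue, scaled by the doorbell stride).
 */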
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
{
	return dev->nr_allocated_queues * 8 * dev->db_stride;
}

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev);

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev);

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return;

	nvmeq->dbbuf_sq_db = NULL;
	nvmeq->dbbuf_cq_db = NULL;
	nvmeq->dbbuf_sq_ei = NULL;
	nvmeq->dbbuf_cq_ei = NULL;
}

static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;
	unsigned int i;

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);

		for (i = 1; i <= dev->online_queues; i++)
			nvme_dbbuf_free(&dev->queues[i]);
	}
}

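/*
 * An MMIO doorbell write is only needed when the new doorbell value passes
 * the event index the controller advertised.  The u16 arithmetic below is
 * deliberately wrap-safe; for example with event_idx = 3, old = 2 and
 * new_idx = 5: (5 - 3 - 1) = 1 < (5 - 2) = 3, so the doorbell must be rung.
 */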
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory.  The controller needs to provide similar
		 * ordering to ensure the event index is updated before reading
		 * the doorbell.
		 */
		mb();

		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
}

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_pci_npages_prp(void)
{
	unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
				      NVME_CTRL_PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
static int nvme_pci_npages_sgl(void)
{
	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
			PAGE_SIZE);
}

static size_t nvme_pci_iod_alloc_size(void)
{
	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());

	return sizeof(__le64 *) * npages +
		sizeof(struct scatterlist) * NVME_MAX_SEGS;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);

	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;

	nvme_req(req)->ctrl = &dev->ctrl;
	nvme_req(req)->cmd = &iod->cmd;
	return 0;
}

static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}

static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
			BUG_ON(i == HCTX_TYPE_DEFAULT);
			continue;
		}

		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
		map->queue_offset = qoff;
		if (i != HCTX_TYPE_POLL && offset)
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
}

/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 * @write_sq: whether to write to the SQ doorbell
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
{
	spin_lock(&nvmeq->sq_lock);
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
	       cmd, sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	nvme_write_sq_db(nvmeq, write_sq);
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
	spin_unlock(&nvmeq->sq_lock);
}

static void **nvme_pci_iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}

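/*
 * Heuristic for choosing SGLs over PRPs: the controller must support SGLs
 * for I/O commands (SGLS bits 0/1 in the Identify Controller data), the
 * request must target an I/O queue, and the average segment size has to be
 * at least sgl_threshold.
 */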
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int nseg = blk_rq_nr_phys_segments(req);
	unsigned int avg_seg_size;

	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
{
	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	dma_addr_t dma_addr = iod->first_dma;
	int i;

	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = nvme_pci_iod_list(req)[i];
		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);

		dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
		dma_addr = next_dma_addr;
	}
}

static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
{
	const int last_sg = SGES_PER_PAGE - 1;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	dma_addr_t dma_addr = iod->first_dma;
	int i;

	for (i = 0; i < iod->npages; i++) {
		struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
		dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);

		dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
		dma_addr = next_dma_addr;
	}
}

static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
				    rq_dma_dir(req));
	else
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
}

static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	if (iod->dma_len) {
		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
			       rq_dma_dir(req));
		return;
	}

	WARN_ON_ONCE(!iod->nents);

	nvme_unmap_sg(dev, req);
	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			      iod->first_dma);
	else if (iod->use_sgl)
		nvme_free_sgls(dev, req);
	else
		nvme_free_prps(dev, req);
	mempool_free(iod->sg, dev->iod_mempool);
}

static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}

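/*
 * Build the PRP list for a request.  The first PRP entry may start at any
 * offset within a controller page; every following entry must be page
 * aligned.  When more than two entries are needed, PRP2 points to a list of
 * PRPs, and the last slot of each full list page chains to the next page.
 */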
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	int length = blk_rq_payload_bytes(req);
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
	__le64 *prp_list;
	void **list = nvme_pci_iod_list(req);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (NVME_CTRL_PAGE_SIZE - offset);
	if (length <= 0) {
		iod->first_dma = 0;
		goto done;
	}

	dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= NVME_CTRL_PAGE_SIZE) {
		iod->first_dma = dma_addr;
		goto done;
	}

	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				goto free_prps;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= NVME_CTRL_PAGE_SIZE;
		dma_addr += NVME_CTRL_PAGE_SIZE;
		length -= NVME_CTRL_PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		if (unlikely(dma_len < 0))
			goto bad_sgl;
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}
done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
	return BLK_STS_OK;
free_prps:
	nvme_free_prps(dev, req);
	return BLK_STS_RESOURCE;
bad_sgl:
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
	return BLK_STS_IOERR;
}

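/*
 * SGL helpers: a data descriptor maps one scatterlist element, while a
 * segment descriptor points at the next page of descriptors.  The last
 * descriptor in a full page is turned into such a link when the list has to
 * be chained.
 */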
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmd, int entries)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
	int i = 0;

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

	if (entries == 1) {
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				goto free_sgls;

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
	} while (--entries > 0);

	return BLK_STS_OK;
free_sgls:
	nvme_free_sgls(dev, req);
	return BLK_STS_RESOURCE;
}

static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
	if (bv->bv_len > first_prp_len)
		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
	return BLK_STS_OK;
}

static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->flags = NVME_CMD_SGL_METABUF;
	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
	return BLK_STS_OK;
}

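/*
 * Map the request's data for DMA.  Single-segment requests take a fast path
 * that avoids allocating a scatterlist: a PRP mapping if the bvec fits in
 * two controller pages, or a single SGL data descriptor when the controller
 * supports SGLs.  Everything else goes through the full PRP/SGL setup above.
 */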
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	blk_status_t ret = BLK_STS_RESOURCE;
	int nr_mapped;

	if (blk_rq_nr_phys_segments(req) == 1) {
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);

			if (iod->nvmeq->qid && sgl_threshold &&
			    dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
				return nvme_setup_sgl_simple(dev, req,
							     &cmnd->rw, &bv);
		}
	}

	iod->dma_len = 0;
	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
	if (!iod->sg)
		return BLK_STS_RESOURCE;
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
	if (!iod->nents)
		goto out_free_sg;

	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
	if (!nr_mapped)
		goto out_free_sg;

	iod->use_sgl = nvme_pci_use_sgls(dev, req);
	if (iod->use_sgl)
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
	if (ret != BLK_STS_OK)
		goto out_unmap_sg;
	return BLK_STS_OK;

out_unmap_sg:
	nvme_unmap_sg(dev, req);
out_free_sg:
	mempool_free(iod->sg, dev->iod_mempool);
	return ret;
}

static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
			rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->meta_dma))
		return BLK_STS_IOERR;
	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
	return BLK_STS_OK;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_command *cmnd = &iod->cmd;
	blk_status_t ret;

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;

	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
		return BLK_STS_IOERR;

	ret = nvme_setup_cmd(ns, req);
	if (ret)
		return ret;

	if (blk_rq_nr_phys_segments(req)) {
		ret = nvme_map_data(dev, req, cmnd);
		if (ret)
			goto out_free_cmd;
	}

	if (blk_integrity_rq(req)) {
		ret = nvme_map_metadata(dev, req, cmnd);
		if (ret)
			goto out_unmap_data;
	}

	blk_mq_start_request(req);
	nvme_submit_cmd(nvmeq, cmnd, bd->last);
	return BLK_STS_OK;
out_unmap_data:
	nvme_unmap_data(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}

static void nvme_pci_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dev *dev = iod->nvmeq->dev;

	if (blk_integrity_rq(req))
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
	if (blk_rq_nr_phys_segments(req))
		nvme_unmap_data(dev, req);
	nvme_complete_rq(req);
}

/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];

	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
}

static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return nvmeq->dev->admin_tagset.tags[0];
	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}

static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	struct nvme_completion *cqe = &nvmeq->cqes[idx];
	__u16 command_id = READ_ONCE(cqe->command_id);
	struct request *req;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), command_id);
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
		nvme_pci_complete_rq(req);
}

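/*
 * Advance the CQ head; when it wraps back to the start of the queue the
 * expected phase bit is flipped, which is how new entries are told apart
 * from stale ones on the next pass.
 */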
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
	u16 tmp = nvmeq->cq_head + 1;

	if (tmp == nvmeq->q_depth) {
		nvmeq->cq_head = 0;
		nvmeq->cq_phase ^= 1;
	} else {
		nvmeq->cq_head = tmp;
	}
}

static inline int nvme_process_cq(struct nvme_queue *nvmeq)
{
	int found = 0;

	while (nvme_cqe_pending(nvmeq)) {
		found++;
		/*
		 * load-load control dependency between phase and the rest of
		 * the cqe requires a full read memory barrier
		 */
		dma_rmb();
		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
		nvme_update_cq_head(nvmeq);
	}

	if (found)
		nvme_ring_cq_doorbell(nvmeq);
	return found;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;

	if (nvme_process_cq(nvmeq))
		return IRQ_HANDLED;
	return IRQ_NONE;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;

	if (nvme_cqe_pending(nvmeq))
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
}

/*
 * Poll for completions for any interrupt driven queue
 * Can be called from any context.
 */
static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);

	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));

	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	nvme_process_cq(nvmeq);
	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

	spin_lock(&nvmeq->cq_poll_lock);
	found = nvme_process_cq(nvmeq);
	spin_unlock(&nvmeq->cq_poll_lock);

	return found;
}

static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	struct nvme_queue *nvmeq = &dev->queues[0];
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	nvme_submit_cmd(nvmeq, &c, true);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void abort_endio(struct request *req, blk_status_t error)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;

	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
}

static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	default:
		break;
	}

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
}

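/*
 * Request timeout handler.  In short: reset right away if the controller is
 * in a fatal state, then poll for a missed completion, and only then fall
 * back to aborting the command.  If the command was already aborted once (or
 * sits on the admin queue), the controller itself is reset instead.
 */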
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_command cmd;
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);
		return BLK_EH_DONE;
	}

	/*
	 * Did we miss an interrupt?
	 */
	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
		nvme_poll(req->mq_hctx);
	else
		nvme_poll_irqdisable(nvmeq);

	if (blk_mq_request_completed(req)) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
		return BLK_EH_DONE;
	}

	/*
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
		fallthrough;
	case NVME_CTRL_DELETING:
		dev_warn_ratelimited(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		nvme_dev_disable(dev, true);
		return BLK_EH_DONE;
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
	default:
		break;
	}

	/*
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);

		return BLK_EH_DONE;
	}

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	iod->aborted = 1;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->dev->ctrl.device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
			BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (!nvmeq->sq_cmds)
		return;

	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
	} else {
		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	}
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
		nvme_free_queue(&dev->queues[i]);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq: queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
		return 1;

	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
	mb();

	nvmeq->dev->online_queues--;
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
	return 0;
}

static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
}

static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *nvmeq = &dev->queues[0];

	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
		nvme_disable_ctrl(&dev->ctrl);

	nvme_poll_irqdisable(nvmeq);
}

/*
 * Called only on a device that has been disabled and after all other threads
 * that can check this device's completion queues have synced, except
 * nvme_poll(). This is the last chance for the driver to see a natural
 * completion before nvme_cancel_request() terminates all incomplete requests.
 */
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
		spin_lock(&dev->queues[i].cq_poll_lock);
		nvme_process_cq(&dev->queues[i]);
		spin_unlock(&dev->queues[i].cq_poll_lock);
	}
}

static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  NVME_CTRL_PAGE_SIZE);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);

		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
		if (nvmeq->sq_cmds) {
			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
							nvmeq->sq_cmds);
			if (nvmeq->sq_dma_addr) {
				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
				return 0;
			}

			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
		}
	}

	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
				&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
	return 0;
}

static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid)
		return 0;

	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
	nvmeq->q_depth = depth;
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
		goto free_cqdma;

	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->sq_lock);
	spin_lock_init(&nvmeq->cq_poll_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->qid = qid;
	dev->ctrl.queue_count++;

	return 0;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
 free_nvmeq:
	return -ENOMEM;
}

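/*
 * With use_threaded_interrupts the hard-irq handler only checks whether a
 * completion is pending (nvme_irq_check) and defers the actual CQ processing
 * to the threaded handler (nvme_irq); otherwise nvme_irq runs directly in
 * hard-irq context.
 */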
static int queue_request_irq(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	nvmeq->sq_tail = 0;
	nvmeq->last_sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}

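/*
 * Bring up one I/O queue pair: the completion queue has to exist before the
 * submission queue that posts to it, so the CQ is created first and torn
 * down last on the error path.
 */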
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
	u16 vector = 0;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
		set_bit(NVMEQ_POLLED, &nvmeq->flags);

	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
	if (result)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		return result;
	if (result)
		goto release_cq;

	nvmeq->cq_vector = vector;
	nvme_init_queue(nvmeq, qid);

	if (!polled) {
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;

release_sq:
	dev->online_queues--;
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static const struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.init_hctx	= nvme_admin_init_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
};

static const struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;

		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
		dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
		dev->ctrl.admin_tagset = &dev->admin_tagset;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);

	return 0;
}

static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

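/*
 * Set up the admin queue: disable the controller, program the admin queue
 * attributes (AQA) and the admin SQ/CQ base addresses (ASQ/ACQ), then
 * re-enable the controller and request the admin interrupt.
 */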
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl);
	if (result < 0)
		return result;

	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
	if (result)
		return result;

	dev->ctrl.numa_node = dev_to_node(dev->dev);

	nvmeq = &dev->queues[0];
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl);
	if (result)
		return result;

	nvmeq->cq_vector = 0;
	nvme_init_queue(nvmeq, 0);
	result = queue_request_irq(nvmeq);
	if (result) {
		dev->online_queues--;
		return result;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;
}

static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
	} else {
		rw_queues = max;
	}

	for (i = dev->online_queues; i <= max; i++) {
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}

static void nvme_map_cmb(struct nvme_dev *dev)
1813
{
1814
	u64 size, offset;
1815 1816
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
1817
	int bar;
1818

1819 1820 1821
	if (dev->cmb_size)
		return;

1822 1823 1824
	if (NVME_CAP_CMBS(dev->ctrl.cap))
		writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC);

1825
	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
1826 1827
	if (!dev->cmbsz)
		return;
1828
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
1829

1830 1831
	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
1832 1833
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);
1834 1835

	if (offset > bar_size)
1836
		return;
1837

1838 1839 1840 1841 1842 1843 1844 1845 1846 1847
	/*
	 * Tell the controller about the host side address mapping the CMB,
	 * and enable CMB decoding for the NVMe 1.4+ scheme:
	 */
	if (NVME_CAP_CMBS(dev->ctrl.cap)) {
		hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
			     (pci_bus_address(pdev, bar) + offset),
			     dev->bar + NVME_REG_CMBMSC);
	}

1848 1849 1850 1851 1852 1853 1854 1855
	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

1856 1857 1858
	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
1859
		return;
1860 1861
	}

1862
	dev->cmb_size = size;
1863 1864 1865 1866 1867
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);
1868 1869 1870 1871 1872

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
1873 1874 1875 1876
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
1877
	if (dev->cmb_size) {
1878 1879
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
1880
		dev->cmb_size = 0;
1881 1882 1883
	}
}

1884 1885
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
1886
	u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
1887
	u64 dma_addr = dev->host_mem_descs_dma;
1888 1889 1890 1891 1892 1893 1894
	struct nvme_command c;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode	= nvme_admin_set_features;
	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	c.features.dword11	= cpu_to_le32(bits);
1895
	c.features.dword12	= cpu_to_le32(host_mem_size);
1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914
	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
	if (ret) {
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	}
	return ret;
}

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
1915
		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
1916

1917 1918 1919
		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
			       le64_to_cpu(desc->addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
1920 1921 1922 1923
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
1924 1925 1926
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
1927
	dev->host_mem_descs = NULL;
1928
	dev->nr_host_mem_descs = 0;
1929 1930
}

1931 1932
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
K
Keith Busch 已提交
1933
{
1934
	struct nvme_host_mem_buf_desc *descs;
1935
	u32 max_entries, len;
1936
	dma_addr_t descs_dma;
1937
	int i = 0;
1938
	void **bufs;
1939
	u64 size, tmp;
1940 1941 1942 1943

	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;
1944 1945 1946 1947

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

1948 1949
	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
				   &descs_dma, GFP_KERNEL);
1950 1951 1952 1953 1954 1955 1956
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

1957
	for (size = 0; size < preferred && i < max_entries; size += len) {
1958 1959
		dma_addr_t dma_addr;

1960
		len = min_t(u64, chunk_size, preferred - size);
1961 1962 1963 1964 1965 1966
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
1967
		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
1968 1969 1970
		i++;
	}

1971
	if (!size)
1972 1973 1974 1975 1976
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
1977
	dev->host_mem_descs_dma = descs_dma;
1978 1979 1980 1981 1982
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--i >= 0) {
1983
		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
1984

1985 1986 1987
		dma_free_attrs(dev->dev, size, bufs[i],
			       le64_to_cpu(descs[i].addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
1988 1989 1990 1991
	}

	kfree(bufs);
out_free_descs:
1992 1993
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
1994 1995 1996 1997 1998
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

1999 2000
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
2001 2002 2003
	u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	u64 chunk_size;
2004 2005

	/* start big and work our way down */
2006
	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}

2017
static int nvme_setup_host_mem(struct nvme_dev *dev)
2018 2019 2020 2021 2022
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
2023
	int ret;
2024 2025 2026 2027 2028 2029 2030

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
2031
		return 0;
2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044
	}

	/*
	 * If we already have a buffer allocated check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
2045 2046 2047
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
2048
			return 0; /* controller must work without HMB */
2049 2050 2051 2052 2053
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
2054 2055
	}

2056 2057
	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
2058
		nvme_free_host_mem(dev);
2059
	return ret;
K
Keith Busch 已提交
2060 2061
}

2062 2063 2064 2065 2066
/*
 * nirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
 */
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
2067
{
2068
	struct nvme_dev *dev = affd->priv;
2069
	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
2070 2071

	/*
B
Baolin Wang 已提交
2072
	 * If there is no interrupt available for queues, ensure that
2073 2074 2075 2076 2077 2078 2079 2080
	 * the default queue is set to 1. The affinity set size is
	 * also set to one, but the irq core ignores it for this case.
	 *
	 * If only one interrupt is available or 'write_queue' == 0, combine
	 * write and read queues.
	 *
	 * If 'write_queues' > 0, ensure it leaves room for at least one read
	 * queue.
2081
	 */
2082 2083 2084
	if (!nrirqs) {
		nrirqs = 1;
		nr_read_queues = 0;
2085
	} else if (nrirqs == 1 || !nr_write_queues) {
2086
		nr_read_queues = 0;
2087
	} else if (nr_write_queues >= nrirqs) {
2088
		nr_read_queues = 1;
2089
	} else {
2090
		nr_read_queues = nrirqs - nr_write_queues;
2091
	}
2092 2093 2094 2095 2096 2097

	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
	affd->nr_sets = nr_read_queues ? 2 : 1;
2098 2099
}

2100
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
2101 2102 2103
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	struct irq_affinity affd = {
2104
		.pre_vectors	= 1,
2105 2106
		.calc_sets	= nvme_calc_irq_sets,
		.priv		= dev,
2107
	};
2108
	unsigned int irq_queues, poll_queues;
2109 2110

	/*
2111 2112
	 * Poll queues don't need interrupts, but we need at least one I/O queue
	 * left over for non-polled I/O.
2113
	 */
2114 2115
	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
2116

2117 2118 2119 2120
	/*
	 * Initialize for the single interrupt case, will be updated in
	 * nvme_calc_irq_sets().
	 */
2121 2122
	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
	dev->io_queues[HCTX_TYPE_READ] = 0;
2123

2124
	/*
2125 2126 2127
	 * We need interrupts for the admin queue and each non-polled I/O queue,
	 * but some Apple controllers require all queues to use the first
	 * vector.
2128
	 */
2129 2130 2131
	irq_queues = 1;
	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
		irq_queues += (nr_io_queues - poll_queues);
2132 2133
	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
2134 2135
}

2136 2137 2138 2139 2140 2141
static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}

2142 2143
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
2144 2145 2146 2147 2148 2149
	/*
	 * If tags are shared with admin queue (Apple bug), then
	 * make sure we only use one IO queue.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
		return 1;
2150 2151 2152
	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
}

2153
static int nvme_setup_io_queues(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
2154
{
2155
	struct nvme_queue *adminq = &dev->queues[0];
2156
	struct pci_dev *pdev = to_pci_dev(dev->dev);
2157
	unsigned int nr_io_queues;
2158
	unsigned long size;
2159
	int result;
M
Matthew Wilcox 已提交
2160

2161 2162 2163 2164 2165 2166
	/*
	 * Sample the module parameters once at reset time so that we have
	 * stable values to work with.
	 */
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
2167

2168
	nr_io_queues = dev->nr_allocated_queues - 1;
C
Christoph Hellwig 已提交
2169 2170
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
M
Matthew Wilcox 已提交
2171
		return result;
C
Christoph Hellwig 已提交
2172

2173
	if (nr_io_queues == 0)
2174
		return 0;
2175

2176
	clear_bit(NVMEQ_ENABLED, &adminq->flags);
M
Matthew Wilcox 已提交
2177

2178
	if (dev->cmb_use_sqes) {
2179 2180 2181 2182 2183
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
2184
			dev->cmb_use_sqes = false;
2185 2186
	}

2187 2188 2189 2190 2191 2192 2193 2194 2195
	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues)
			return -ENOMEM;
	} while (1);
	adminq->q_db = dev->dbs;
2196

2197
 retry:
K
Keith Busch 已提交
2198
	/* Deregister the admin queue's interrupt */
2199
	pci_free_irq(pdev, 0, adminq);
K
Keith Busch 已提交
2200

2201 2202 2203 2204
	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
2205
	pci_free_irq_vectors(pdev);
2206 2207

	result = nvme_setup_irqs(dev, nr_io_queues);
2208
	if (result <= 0)
2209
		return -EIO;
2210

2211
	dev->num_vecs = result;
J
Jens Axboe 已提交
2212
	result = max(result - 1, 1);
2213
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
R
Ramachandra Rao Gajula 已提交
2214

2215 2216 2217 2218 2219 2220
	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
2221
	result = queue_request_irq(adminq);
2222
	if (result)
K
Keith Busch 已提交
2223
		return result;
2224
	set_bit(NVMEQ_ENABLED, &adminq->flags);
2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240

	result = nvme_create_io_queues(dev);
	if (result || dev->online_queues < 2)
		return result;

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_disable_io_queues(dev);
		nvme_suspend_io_queues(dev);
		goto retry;
	}
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
	return 0;
M
Matthew Wilcox 已提交
2241 2242
}

2243
static void nvme_del_queue_end(struct request *req, blk_status_t error)
K
Keith Busch 已提交
2244
{
K
Keith Busch 已提交
2245
	struct nvme_queue *nvmeq = req->end_io_data;
2246

K
Keith Busch 已提交
2247
	blk_mq_free_request(req);
2248
	complete(&nvmeq->delete_done);
K
Keith Busch 已提交
2249 2250
}

2251
static void nvme_del_cq_end(struct request *req, blk_status_t error)
K
Keith Busch 已提交
2252
{
K
Keith Busch 已提交
2253
	struct nvme_queue *nvmeq = req->end_io_data;
K
Keith Busch 已提交
2254

2255 2256
	if (error)
		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
K
Keith Busch 已提交
2257 2258

	nvme_del_queue_end(req, error);
K
Keith Busch 已提交
2259 2260
}

K
Keith Busch 已提交
2261
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
2262
{
K
Keith Busch 已提交
2263 2264 2265
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;
2266

K
Keith Busch 已提交
2267 2268 2269
	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2270

2271
	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
K
Keith Busch 已提交
2272 2273
	if (IS_ERR(req))
		return PTR_ERR(req);
2274

K
Keith Busch 已提交
2275 2276
	req->end_io_data = nvmeq;

2277
	init_completion(&nvmeq->delete_done);
2278
	blk_execute_rq_nowait(NULL, req, false,
K
Keith Busch 已提交
2279 2280 2281
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
2282 2283
}

2284
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
K
Keith Busch 已提交
2285
{
2286
	int nr_queues = dev->online_queues - 1, sent = 0;
K
Keith Busch 已提交
2287
	unsigned long timeout;
K
Keith Busch 已提交
2288

K
Keith Busch 已提交
2289
 retry:
2290
	timeout = NVME_ADMIN_TIMEOUT;
2291 2292 2293 2294 2295
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
K
Keith Busch 已提交
2296
	}
2297 2298 2299 2300
	while (sent) {
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
2301 2302 2303
				timeout);
		if (timeout == 0)
			return false;
2304 2305

		sent--;
2306 2307 2308 2309
		if (nr_queues)
			goto retry;
	}
	return true;
K
Keith Busch 已提交
2310 2311
}

K
Keith Busch 已提交
2312
static void nvme_dev_add(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
2313
{
2314 2315
	int ret;

2316
	if (!dev->ctrl.tagset) {
2317
		dev->tagset.ops = &nvme_mq_ops;
2318
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
2319
		dev->tagset.nr_maps = 2; /* default + read */
2320 2321
		if (dev->io_queues[HCTX_TYPE_POLL])
			dev->tagset.nr_maps++;
2322
		dev->tagset.timeout = NVME_IO_TIMEOUT;
2323
		dev->tagset.numa_node = dev->ctrl.numa_node;
2324 2325
		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
						BLK_MQ_MAX_DEPTH) - 1;
2326
		dev->tagset.cmd_size = sizeof(struct nvme_iod);
2327 2328
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;
M
Matthew Wilcox 已提交
2329

2330 2331 2332 2333 2334 2335 2336 2337
		/*
		 * Some Apple controllers requires tags to be unique
		 * across admin and IO queue, so reserve the first 32
		 * tags of the IO queue.
		 */
		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
			dev->tagset.reserved_tags = NVME_AQ_DEPTH;

2338 2339 2340 2341
		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
K
Keith Busch 已提交
2342
			return;
2343
		}
2344
		dev->ctrl.tagset = &dev->tagset;
2345 2346 2347 2348 2349
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
2350
	}
2351

2352
	nvme_dbbuf_set(dev);
M
Matthew Wilcox 已提交
2353 2354
}

2355
static int nvme_pci_enable(struct nvme_dev *dev)
2356
{
2357
	int result = -ENOMEM;
2358
	struct pci_dev *pdev = to_pci_dev(dev->dev);
2359
	int dma_address_bits = 64;
2360 2361 2362 2363 2364 2365

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

2366 2367 2368
	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
		dma_address_bits = 48;
	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
2369
		goto disable;
2370

2371
	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
K
Keith Busch 已提交
2372
		result = -ENODEV;
2373
		goto disable;
K
Keith Busch 已提交
2374
	}
2375 2376

	/*
2377 2378 2379
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
2380
	 */
2381 2382 2383
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;
2384

2385
	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
2386

2387
	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
2388
				io_queue_depth);
2389
	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
2390
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
2391
	dev->dbs = dev->bar + 4096;
2392

2393 2394 2395 2396 2397 2398 2399 2400 2401
	/*
	 * Some Apple controllers require a non-standard SQE size.
	 * Interestingly they also seem to ignore the CC:IOSQES register
	 * so we don't bother updating it here.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
		dev->io_sqes = 7;
	else
		dev->io_sqes = NVME_NVM_IOSQES;
2402 2403 2404 2405 2406 2407 2408

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
2409 2410
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
2411
			dev->q_depth);
2412 2413
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
2414
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
2415 2416 2417
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
                        "set queue depth=%u\n", dev->q_depth);
2418 2419
	}

2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431
	/*
	 * Controllers with the shared tags quirk need the IO queue to be
	 * big enough so that we get 32 tags for the admin queue
	 */
	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
		dev->q_depth = NVME_AQ_DEPTH + 2;
		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
			 dev->q_depth);
	}


2432
	nvme_map_cmb(dev);
2433

K
Keith Busch 已提交
2434 2435
	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
2436 2437 2438 2439 2440 2441 2442 2443
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
2444 2445 2446
{
	if (dev->bar)
		iounmap(dev->bar);
2447
	pci_release_mem_regions(to_pci_dev(dev->dev));
2448 2449 2450
}

static void nvme_pci_disable(struct nvme_dev *dev)
2451
{
2452 2453
	struct pci_dev *pdev = to_pci_dev(dev->dev);

2454
	pci_free_irq_vectors(pdev);
2455

K
Keith Busch 已提交
2456 2457
	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
2458
		pci_disable_device(pdev);
K
Keith Busch 已提交
2459 2460 2461
	}
}

2462
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
M
Matthew Wilcox 已提交
2463
{
2464
	bool dead = true, freeze = false;
K
Keith Busch 已提交
2465
	struct pci_dev *pdev = to_pci_dev(dev->dev);
2466

2467
	mutex_lock(&dev->shutdown_lock);
K
Keith Busch 已提交
2468 2469 2470
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

K
Keith Busch 已提交
2471
		if (dev->ctrl.state == NVME_CTRL_LIVE ||
2472 2473
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			freeze = true;
K
Keith Busch 已提交
2474
			nvme_start_freeze(&dev->ctrl);
2475
		}
K
Keith Busch 已提交
2476 2477
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state  != pci_channel_io_normal);
2478
	}
2479

K
Keith Busch 已提交
2480 2481 2482 2483
	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
2484 2485
	if (!dead && shutdown && freeze)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
2486 2487

	nvme_stop_queues(&dev->ctrl);
2488

2489
	if (!dead && dev->ctrl.queue_count > 0) {
2490
		nvme_disable_io_queues(dev);
2491
		nvme_disable_admin_queue(dev, shutdown);
K
Keith Busch 已提交
2492
	}
2493 2494
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
2495
	nvme_pci_disable(dev);
2496
	nvme_reap_pending_cqes(dev);
2497

2498 2499
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
2500 2501
	blk_mq_tagset_wait_completed_request(&dev->tagset);
	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
K
Keith Busch 已提交
2502 2503 2504 2505 2506 2507

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
2508
	if (shutdown) {
K
Keith Busch 已提交
2509
		nvme_start_queues(&dev->ctrl);
2510 2511 2512
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
	}
2513
	mutex_unlock(&dev->shutdown_lock);
M
Matthew Wilcox 已提交
2514 2515
}

2516 2517 2518 2519 2520 2521 2522 2523
static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (!nvme_wait_reset(&dev->ctrl))
		return -EBUSY;
	nvme_dev_disable(dev, shutdown);
	return 0;
}

M
Matthew Wilcox 已提交
2524 2525
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
2526
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
C
Christoph Hellwig 已提交
2527 2528
						NVME_CTRL_PAGE_SIZE,
						NVME_CTRL_PAGE_SIZE, 0);
M
Matthew Wilcox 已提交
2529 2530 2531
	if (!dev->prp_page_pool)
		return -ENOMEM;

2532
	/* Optimisation for I/Os between 4k and 128k */
2533
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
2534 2535 2536 2537 2538
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
M
Matthew Wilcox 已提交
2539 2540 2541 2542 2543 2544
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
2545
	dma_pool_destroy(dev->prp_small_pool);
M
Matthew Wilcox 已提交
2546 2547
}

2548 2549 2550 2551 2552 2553 2554
static void nvme_free_tagset(struct nvme_dev *dev)
{
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	dev->ctrl.tagset = NULL;
}

2555
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
2556
{
2557
	struct nvme_dev *dev = to_nvme_dev(ctrl);
2558

2559
	nvme_dbbuf_dma_free(dev);
2560
	nvme_free_tagset(dev);
2561 2562
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
2563
	free_opal_dev(dev->ctrl.opal_dev);
2564
	mempool_destroy(dev->iod_mempool);
2565 2566
	put_device(dev->dev);
	kfree(dev->queues);
2567 2568 2569
	kfree(dev);
}

2570
static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
2571
{
2572 2573 2574 2575 2576
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
2577
	nvme_get_ctrl(&dev->ctrl);
2578
	nvme_dev_disable(dev, false);
2579
	nvme_kill_queues(&dev->ctrl);
2580
	if (!queue_work(nvme_wq, &dev->remove_work))
2581 2582 2583
		nvme_put_ctrl(&dev->ctrl);
}

2584
static void nvme_reset_work(struct work_struct *work)
2585
{
2586 2587
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
2588
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
2589
	int result;
2590

2591 2592
	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
		result = -ENODEV;
2593
		goto out;
2594
	}
2595

2596 2597 2598 2599
	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
2600
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
2601
		nvme_dev_disable(dev, false);
K
Keith Busch 已提交
2602
	nvme_sync_queues(&dev->ctrl);
2603

2604
	mutex_lock(&dev->shutdown_lock);
2605
	result = nvme_pci_enable(dev);
2606
	if (result)
2607
		goto out_unlock;
2608

2609
	result = nvme_pci_configure_admin_queue(dev);
2610
	if (result)
2611
		goto out_unlock;
2612

K
Keith Busch 已提交
2613 2614
	result = nvme_alloc_admin_tags(dev);
	if (result)
2615
		goto out_unlock;
2616

2617 2618 2619 2620
	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
2621 2622
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
2623
	dev->ctrl.max_segments = NVME_MAX_SEGS;
2624 2625 2626 2627 2628

	/*
	 * Don't limit the IOMMU merged segment size.
	 */
	dma_set_max_seg_size(dev->dev, 0xffffffff);
J
Jianxiong Gao 已提交
2629
	dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
2630

2631 2632 2633 2634 2635 2636 2637 2638 2639
	mutex_unlock(&dev->shutdown_lock);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
2640
		result = -EBUSY;
2641 2642
		goto out;
	}
2643

2644 2645 2646 2647 2648 2649
	/*
	 * We do not support an SGL for metadata (yet), so we are limited to a
	 * single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;

2650
	result = nvme_init_ctrl_finish(&dev->ctrl);
2651
	if (result)
2652
		goto out;
2653

2654 2655 2656 2657 2658 2659 2660 2661 2662
	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
2663
	}
2664

2665 2666 2667 2668 2669 2670 2671
	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

2672 2673 2674 2675 2676
	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}
2677

2678
	result = nvme_setup_io_queues(dev);
2679
	if (result)
2680
		goto out;
2681

2682 2683 2684 2685
	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
2686
	if (dev->online_queues < 2) {
2687
		dev_warn(dev->ctrl.device, "IO queues not created\n");
2688
		nvme_kill_queues(&dev->ctrl);
2689
		nvme_remove_namespaces(&dev->ctrl);
2690
		nvme_free_tagset(dev);
2691
	} else {
2692
		nvme_start_queues(&dev->ctrl);
K
Keith Busch 已提交
2693
		nvme_wait_freeze(&dev->ctrl);
K
Keith Busch 已提交
2694
		nvme_dev_add(dev);
K
Keith Busch 已提交
2695
		nvme_unfreeze(&dev->ctrl);
2696 2697
	}

2698 2699 2700 2701
	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
K
Keith Busch 已提交
2702
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
2703
		dev_warn(dev->ctrl.device,
K
Keith Busch 已提交
2704
			"failed to mark controller live state\n");
2705
		result = -ENODEV;
2706 2707
		goto out;
	}
2708

2709
	nvme_start_ctrl(&dev->ctrl);
2710
	return;
2711

2712 2713
 out_unlock:
	mutex_unlock(&dev->shutdown_lock);
2714
 out:
2715 2716 2717 2718
	if (result)
		dev_warn(dev->ctrl.device,
			 "Removing after probe failure status: %d\n", result);
	nvme_remove_dead_ctrl(dev);
2719 2720
}

2721
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
K
Keith Busch 已提交
2722
{
2723
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
2724
	struct pci_dev *pdev = to_pci_dev(dev->dev);
K
Keith Busch 已提交
2725 2726

	if (pci_get_drvdata(pdev))
K
Keith Busch 已提交
2727
		device_release_driver(&pdev->dev);
2728
	nvme_put_ctrl(&dev->ctrl);
K
Keith Busch 已提交
2729 2730
}

2731
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
T
Tejun Heo 已提交
2732
{
2733
	*val = readl(to_nvme_dev(ctrl)->bar + off);
2734
	return 0;
T
Tejun Heo 已提交
2735 2736
}

2737
static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
2738
{
2739 2740 2741
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}
2742

2743 2744
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
2745
	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
2746
	return 0;
2747 2748
}

2749 2750 2751 2752
static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

2753
	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
2754 2755
}

2756
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
M
Ming Lin 已提交
2757
	.name			= "pcie",
2758
	.module			= THIS_MODULE,
2759 2760
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
2761
	.reg_read32		= nvme_pci_reg_read32,
2762
	.reg_write32		= nvme_pci_reg_write32,
2763
	.reg_read64		= nvme_pci_reg_read64,
2764
	.free_ctrl		= nvme_pci_free_ctrl,
2765
	.submit_async_event	= nvme_pci_submit_async_event,
2766
	.get_address		= nvme_pci_get_address,
2767
};
2768

2769 2770 2771 2772
static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

2773
	if (pci_request_mem_regions(pdev, "nvme"))
2774 2775
		return -ENODEV;

2776
	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
2777 2778
		goto release;

M
Max Gurtovoy 已提交
2779
	return 0;
2780
  release:
M
Max Gurtovoy 已提交
2781 2782
	pci_release_mem_regions(pdev);
	return -ENODEV;
2783 2784
}

2785
static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and uses the deepest sleep state.
		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
		 * 950 PRO 256GB", but it seems to be restricted to two Dell
		 * laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
2800 2801 2802
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
2803 2804 2805
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A
2806 2807
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
2808 2809
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
2810
			return NVME_QUIRK_NO_APST;
2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822
	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
		/*
		 * Forcing to use host managed nvme power settings for
		 * lowest idle power with quick resume latency on
		 * Samsung and Toshiba SSDs based on suspend behavior
		 * on Coffee Lake board for LENOVO C640
		 */
		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
			return NVME_QUIRK_SIMPLE_SUSPEND;
2823 2824 2825 2826 2827
	}

	return 0;
}

2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875
#ifdef CONFIG_ACPI
static bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
	struct acpi_device *adev;
	struct pci_dev *root;
	acpi_handle handle;
	acpi_status status;
	u8 val;

	/*
	 * Look for _DSD property specifying that the storage device on the port
	 * must use D3 to support deep platform power savings during
	 * suspend-to-idle.
	 */
	root = pcie_find_root_port(dev);
	if (!root)
		return false;

	adev = ACPI_COMPANION(&root->dev);
	if (!adev)
		return false;

	/*
	 * The property is defined in the PXSX device for South complex ports
	 * and in the PEGP device for North complex ports.
	 */
	status = acpi_get_handle(adev->handle, "PXSX", &handle);
	if (ACPI_FAILURE(status)) {
		status = acpi_get_handle(adev->handle, "PEGP", &handle);
		if (ACPI_FAILURE(status))
			return false;
	}

	if (acpi_bus_get_device(handle, &adev))
		return false;

	if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
			&val))
		return false;
	return val == 1;
}
#else
static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
	return false;
}
#endif /* CONFIG_ACPI */

2876 2877 2878
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;
2879

2880
	flush_work(&dev->ctrl.reset_work);
2881
	flush_work(&dev->ctrl.scan_work);
2882
	nvme_put_ctrl(&dev->ctrl);
2883 2884
}

2885
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
M
Matthew Wilcox 已提交
2886
{
M
Matias Bjørling 已提交
2887
	int node, result = -ENOMEM;
M
Matthew Wilcox 已提交
2888
	struct nvme_dev *dev;
2889
	unsigned long quirks = id->driver_data;
2890
	size_t alloc_size;
M
Matthew Wilcox 已提交
2891

M
Matias Bjørling 已提交
2892 2893
	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
2894
		set_dev_node(&pdev->dev, first_memory_node);
M
Matias Bjørling 已提交
2895 2896

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
M
Matthew Wilcox 已提交
2897 2898
	if (!dev)
		return -ENOMEM;
2899

2900 2901 2902 2903 2904
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
M
Matthew Wilcox 已提交
2905 2906 2907
	if (!dev->queues)
		goto free;

2908
	dev->dev = get_device(&pdev->dev);
K
Keith Busch 已提交
2909
	pci_set_drvdata(pdev, dev);
2910

2911 2912
	result = nvme_dev_map(dev);
	if (result)
2913
		goto put_pci;
2914

2915
	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
2916
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
2917
	mutex_init(&dev->shutdown_lock);
M
Matthew Wilcox 已提交
2918

M
Matthew Wilcox 已提交
2919 2920
	result = nvme_setup_prp_pools(dev);
	if (result)
2921
		goto unmap;
2922

2923
	quirks |= check_vendor_combination_bug(pdev);
2924

2925 2926 2927 2928 2929 2930 2931 2932 2933 2934
	if (!noacpi && nvme_acpi_storage_d3(pdev)) {
		/*
		 * Some systems use a bios work around to ask for D3 on
		 * platforms that support kernel managed suspend.
		 */
		dev_info(&pdev->dev,
			 "platform quirk: setting simple suspend\n");
		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
	}

2935 2936 2937 2938
	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
2939
	alloc_size = nvme_pci_iod_alloc_size();
2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

2951 2952 2953 2954 2955
	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

2956 2957
	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

2958
	nvme_reset_ctrl(&dev->ctrl);
2959
	async_schedule(nvme_async_probe, dev);
2960

M
Matthew Wilcox 已提交
2961 2962
	return 0;

2963 2964
 release_mempool:
	mempool_destroy(dev->iod_mempool);
2965
 release_pools:
M
Matthew Wilcox 已提交
2966
	nvme_release_prp_pools(dev);
2967 2968
 unmap:
	nvme_dev_unmap(dev);
K
Keith Busch 已提交
2969
 put_pci:
2970
	put_device(dev->dev);
M
Matthew Wilcox 已提交
2971 2972 2973 2974 2975 2976
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

2977
static void nvme_reset_prepare(struct pci_dev *pdev)
2978
{
K
Keith Busch 已提交
2979
	struct nvme_dev *dev = pci_get_drvdata(pdev);
2980 2981 2982 2983 2984 2985 2986 2987

	/*
	 * We don't need to check the return value from waiting for the reset
	 * state as pci_dev device lock is held, making it impossible to race
	 * with ->remove().
	 */
	nvme_disable_prepare_reset(dev, false);
	nvme_sync_queues(&dev->ctrl);
2988
}
2989

2990 2991
static void nvme_reset_done(struct pci_dev *pdev)
{
2992
	struct nvme_dev *dev = pci_get_drvdata(pdev);
2993 2994 2995

	if (!nvme_try_sched_reset(&dev->ctrl))
		flush_work(&dev->ctrl.reset_work);
2996 2997
}

2998 2999 3000
static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
3001

3002
	nvme_disable_prepare_reset(dev, true);
3003 3004
}

3005 3006 3007 3008 3009
/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
3010
static void nvme_remove(struct pci_dev *pdev)
M
Matthew Wilcox 已提交
3011 3012
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
K
Keith Busch 已提交
3013

3014
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
K
Keith Busch 已提交
3015
	pci_set_drvdata(pdev, NULL);
3016

3017
	if (!pci_device_is_present(pdev)) {
3018
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
3019
		nvme_dev_disable(dev, true);
3020
		nvme_dev_remove_admin(dev);
3021
	}
3022

3023
	flush_work(&dev->ctrl.reset_work);
3024 3025
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
3026
	nvme_dev_disable(dev, true);
3027
	nvme_release_cmb(dev);
3028
	nvme_free_host_mem(dev);
M
Matias Bjørling 已提交
3029
	nvme_dev_remove_admin(dev);
3030
	nvme_free_queues(dev, 0);
K
Keith Busch 已提交
3031
	nvme_release_prp_pools(dev);
3032
	nvme_dev_unmap(dev);
3033
	nvme_uninit_ctrl(&dev->ctrl);
M
Matthew Wilcox 已提交
3034 3035
}

3036
#ifdef CONFIG_PM_SLEEP
3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051
static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
}

static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
{
	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
}

static int nvme_resume(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
	struct nvme_ctrl *ctrl = &ndev->ctrl;

3052
	if (ndev->last_ps == U32_MAX ||
3053
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
3054
		return nvme_try_sched_reset(&ndev->ctrl);
3055 3056 3057
	return 0;
}

3058 3059 3060 3061
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);
3062 3063 3064
	struct nvme_ctrl *ctrl = &ndev->ctrl;
	int ret = -EBUSY;

3065 3066
	ndev->last_ps = U32_MAX;

3067 3068 3069 3070 3071 3072 3073
	/*
	 * The platform does not remove power for a kernel managed suspend so
	 * use host managed nvme power settings for lowest idle power if
	 * possible. This should have quicker resume latency than a full device
	 * shutdown.  But if the firmware is involved after the suspend or the
	 * device does not support any non-default power states, shut down the
	 * device fully.
3074 3075 3076 3077 3078
	 *
	 * If ASPM is not enabled for the device, shut down the device and allow
	 * the PCI bus layer to put it into D3 in order to take the PCIe link
	 * down, so as to allow the platform to achieve its minimum low-power
	 * state (which may not be possible if the link is up).
3079 3080 3081 3082 3083
	 *
	 * If a host memory buffer is enabled, shut down the device as the NVMe
	 * specification allows the device to access the host memory buffer in
	 * host DRAM from all power states, but hosts will fail access to DRAM
	 * during S3.
3084
	 */
3085
	if (pm_suspend_via_firmware() || !ctrl->npss ||
3086
	    !pcie_aspm_enabled(pdev) ||
3087
	    ndev->nr_host_mem_descs ||
3088 3089
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);
3090 3091 3092 3093 3094

	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

K
Keith Busch 已提交
3095
	if (ctrl->state != NVME_CTRL_LIVE)
3096 3097 3098 3099 3100 3101
		goto unfreeze;

	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
	if (ret < 0)
		goto unfreeze;

3102 3103 3104 3105 3106 3107 3108
	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);

3109 3110 3111 3112 3113
	ret = nvme_set_power_state(ctrl, ctrl->npss);
	if (ret < 0)
		goto unfreeze;

	if (ret) {
3114 3115 3116
		/* discard the saved state */
		pci_load_saved_state(pdev, NULL);

3117 3118
		/*
		 * Clearing npss forces a controller reset on resume. The
3119
		 * correct value will be rediscovered then.
3120
		 */
3121
		ret = nvme_disable_prepare_reset(ndev, true);
3122 3123 3124 3125 3126 3127 3128 3129 3130 3131
		ctrl->npss = 0;
	}
unfreeze:
	nvme_unfreeze(ctrl);
	return ret;
}

static int nvme_simple_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
3132

3133
	return nvme_disable_prepare_reset(ndev, true);
3134 3135
}

3136
static int nvme_simple_resume(struct device *dev)
3137 3138 3139 3140
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

3141
	return nvme_try_sched_reset(&ndev->ctrl);
3142 3143
}

3144
static const struct dev_pm_ops nvme_dev_pm_ops = {
3145 3146 3147 3148 3149 3150 3151 3152
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.freeze		= nvme_simple_suspend,
	.thaw		= nvme_simple_resume,
	.poweroff	= nvme_simple_suspend,
	.restore	= nvme_simple_resume,
};
#endif /* CONFIG_PM_SLEEP */
M
Matthew Wilcox 已提交
3153

K
Keith Busch 已提交
3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
K
Keith Busch 已提交
3168 3169
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
3170
		nvme_dev_disable(dev, false);
K
Keith Busch 已提交
3171 3172
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
K
Keith Busch 已提交
3173 3174
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
K
Keith Busch 已提交
3175 3176 3177 3178 3179 3180 3181 3182 3183
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

3184
	dev_info(dev->ctrl.device, "restart after slot reset\n");
K
Keith Busch 已提交
3185
	pci_restore_state(pdev);
3186
	nvme_reset_ctrl(&dev->ctrl);
K
Keith Busch 已提交
3187 3188 3189 3190 3191
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
K
Keith Busch 已提交
3192 3193 3194
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
K
Keith Busch 已提交
3195 3196
}

3197
static const struct pci_error_handlers nvme_err_handler = {
M
Matthew Wilcox 已提交
3198 3199 3200
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
3201 3202
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
M
Matthew Wilcox 已提交
3203 3204
};

3205
static const struct pci_device_id nvme_id_table[] = {
3206
	{ PCI_VDEVICE(INTEL, 0x0953),	/* Intel 750/P3500/P3600/P3700 */
3207
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3208
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3209
	{ PCI_VDEVICE(INTEL, 0x0a53),	/* Intel P3520 */
3210
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3211
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3212
	{ PCI_VDEVICE(INTEL, 0x0a54),	/* Intel P4500/P4600 */
3213
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3214
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3215
	{ PCI_VDEVICE(INTEL, 0x0a55),	/* Dell Express Flash P4600 */
3216 3217
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3218
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
3219
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
3220
				NVME_QUIRK_MEDIUM_PRIO_SQ |
3221 3222
				NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3223 3224
	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3225
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
3226 3227
		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3228 3229
	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
3230
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
3231 3232
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
				NVME_QUIRK_NO_NS_DESC_LIST, },
3233 3234
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3235 3236
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3237 3238
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3239 3240 3241
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
3242
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
3243
				NVME_QUIRK_DISABLE_WRITE_ZEROES|
3244
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3245 3246
	{ PCI_DEVICE(0x1987, 0x5016),	/* Phison E16 */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3247 3248 3249
	{ PCI_DEVICE(0x1b4b, 0x1092),	/* Lexar 256 GB SSD */
		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
C
Christoph Hellwig 已提交
3250 3251 3252 3253
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
W
Wei Xu 已提交
3254 3255
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
3256 3257
	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3258 3259 3260
	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3261 3262
	{ PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3263 3264
	{ PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3265 3266
	{ PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3267 3268
	{ PCI_DEVICE(0x2646, 0x2262),   /* KINGSTON SKC2000 NVMe SSD */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3269 3270
	{ PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD  */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3283 3284
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
3285
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
3286 3287
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
3288 3289
				NVME_QUIRK_128_BYTES_SQES |
				NVME_QUIRK_SHARED_TAGS },
3290 3291

	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
M
Matthew Wilcox 已提交
3292 3293 3294 3295 3296 3297 3298 3299
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
3300
	.remove		= nvme_remove,
3301
	.shutdown	= nvme_shutdown,
3302
#ifdef CONFIG_PM_SLEEP
3303 3304 3305
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
3306
#endif
3307
	.sriov_configure = pci_sriov_configure_simple,
M
Matthew Wilcox 已提交
3308 3309 3310 3311 3312
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
3313 3314 3315
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
3316
	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
3317

3318
	return pci_register_driver(&nvme_driver);
M
Matthew Wilcox 已提交
3319 3320 3321 3322 3323
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
3324
	flush_workqueue(nvme_wq);
M
Matthew Wilcox 已提交
3325 3326 3327 3328
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
3329
MODULE_VERSION("1.0");
M
Matthew Wilcox 已提交
3330 3331
module_init(nvme_init);
module_exit(nvme_exit);