pci.c 84.1 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
M
Matthew Wilcox 已提交
2 3
/*
 * NVM Express device driver
4
 * Copyright (c) 2011-2014, Intel Corporation.
M
Matthew Wilcox 已提交
5 6
 */

7
#include <linux/acpi.h>
K
Keith Busch 已提交
8
#include <linux/aer.h>
9
#include <linux/async.h>
M
Matthew Wilcox 已提交
10
#include <linux/blkdev.h>
M
Matias Bjørling 已提交
11
#include <linux/blk-mq.h>
12
#include <linux/blk-mq-pci.h>
13
#include <linux/dmi.h>
M
Matthew Wilcox 已提交
14 15 16 17 18
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
19
#include <linux/mutex.h>
20
#include <linux/once.h>
M
Matthew Wilcox 已提交
21
#include <linux/pci.h>
22
#include <linux/suspend.h>
K
Keith Busch 已提交
23
#include <linux/t10-pi.h>
M
Matthew Wilcox 已提交
24
#include <linux/types.h>
25
#include <linux/io-64-nonatomic-lo-hi.h>
26
#include <linux/sed-opal.h>
27
#include <linux/pci-p2pdma.h>
28

Y
yupeng 已提交
29
#include "trace.h"
30 31
#include "nvme.h"

32
#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
33
#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
34

C
Chaitanya Kulkarni 已提交
35
#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
36

37 38 39 40 41 42 43
/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127

44 45 46
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

47
static bool use_cmb_sqes = true;
48
module_param(use_cmb_sqes, bool, 0444);
49 50
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

51 52 53 54
static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
55

C
Chaitanya Kulkarni 已提交
56 57 58 59 60 61
static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

62 63 64
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
65
	.get = param_get_uint,
66 67
};

68
static unsigned int io_queue_depth = 1024;
69 70 71
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");

72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
	unsigned int n;
	int ret;

	ret = kstrtouint(val, 10, &n);
	if (ret != 0 || n > num_possible_cpus())
		return -EINVAL;
	return param_set_uint(val, kp);
}

static const struct kernel_param_ops io_queue_count_ops = {
	.set = io_queue_count_set,
	.get = param_get_uint,
};

88
static unsigned int write_queues;
89
module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
90 91 92 93
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

94
static unsigned int poll_queues;
95
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
J
Jens Axboe 已提交
96 97
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

98 99 100 101
static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");

102 103
struct nvme_dev;
struct nvme_queue;
104

105
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
106
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
107

108 109 110 111
/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
112
	struct nvme_queue *queues;
113 114 115 116 117 118 119 120
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
121
	unsigned io_queues[HCTX_MAX_TYPES];
122
	unsigned int num_vecs;
123
	u32 q_depth;
124
	int io_sqes;
125 126
	u32 db_stride;
	void __iomem *bar;
127
	unsigned long bar_mapped_size;
128
	struct work_struct remove_work;
129
	struct mutex shutdown_lock;
130 131
	bool subsystem;
	u64 cmb_size;
132
	bool cmb_use_sqes;
133
	u32 cmbsz;
134
	u32 cmbloc;
135
	struct nvme_ctrl ctrl;
136
	u32 last_ps;
137

138 139
	mempool_t *iod_mempool;

140
	/* shadow doorbell buffer support: */
141 142 143 144
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;
145 146 147 148

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
149
	dma_addr_t host_mem_descs_dma;
150 151
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
152 153 154
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;
K
Keith Busch 已提交
155
};
156

157 158
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
159
	int ret;
160
	u32 n;
161

162
	ret = kstrtou32(val, 10, &n);
163 164 165
	if (ret != 0 || n < 2)
		return -EINVAL;

166
	return param_set_uint(val, kp);
167 168
}

169 170 171 172 173 174 175 176 177 178
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}

179 180 181 182 183
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

M
Matthew Wilcox 已提交
184 185 186 187 188
/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
M
Matthew Wilcox 已提交
189
	struct nvme_dev *dev;
190
	spinlock_t sq_lock;
191
	void *sq_cmds;
192 193
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
K
Keith Busch 已提交
194
	struct nvme_completion *cqes;
M
Matthew Wilcox 已提交
195 196 197
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
198
	u32 q_depth;
199
	u16 cq_vector;
M
Matthew Wilcox 已提交
200
	u16 sq_tail;
201
	u16 last_sq_tail;
M
Matthew Wilcox 已提交
202
	u16 cq_head;
K
Keith Busch 已提交
203
	u16 qid;
204
	u8 cq_phase;
205
	u8 sqes;
206 207
	unsigned long flags;
#define NVMEQ_ENABLED		0
208
#define NVMEQ_SQ_CMB		1
209
#define NVMEQ_DELETE_ERROR	2
210
#define NVMEQ_POLLED		3
211 212 213 214
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
215
	struct completion delete_done;
M
Matthew Wilcox 已提交
216 217
};

218
/*
219 220 221 222
 * The nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
 * to the actual struct scatterlist.
223 224
 */
struct nvme_iod {
225
	struct nvme_request req;
C
Christoph Hellwig 已提交
226
	struct nvme_queue *nvmeq;
C
Chaitanya Kulkarni 已提交
227
	bool use_sgl;
C
Christoph Hellwig 已提交
228
	int aborted;
229 230 231
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	dma_addr_t first_dma;
232
	unsigned int dma_len;	/* length of single DMA segment mapping */
233
	dma_addr_t meta_dma;
C
Christoph Hellwig 已提交
234
	struct scatterlist *sg;
M
Matthew Wilcox 已提交
235 236
};

237
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
238
{
239
	return dev->nr_allocated_queues * 8 * dev->db_stride;
240 241 242 243
}

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
244
	unsigned int mem_size = nvme_dbbuf_size(dev);
245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
269
	unsigned int mem_size = nvme_dbbuf_size(dev);
270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

295 296 297 298 299 300 301 302 303 304 305
static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return;

	nvmeq->dbbuf_sq_db = NULL;
	nvmeq->dbbuf_cq_db = NULL;
	nvmeq->dbbuf_sq_ei = NULL;
	nvmeq->dbbuf_cq_ei = NULL;
}

306 307 308
static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;
309
	unsigned int i;
310 311 312 313 314 315 316 317 318 319

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
320
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
321 322
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
323 324 325

		for (i = 1; i <= dev->online_queues; i++)
			nvme_dbbuf_free(&dev->queues[i]);
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
	}
}

static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

350 351 352 353 354 355 356 357
		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory.  The controller needs to provide similar
		 * ordering to ensure the envent index is updated before reading
		 * the doorbell.
		 */
		mb();

358 359 360 361 362
		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
M
Matthew Wilcox 已提交
363 364
}

365 366 367 368 369
/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
370
static int nvme_pci_npages_prp(void)
371
{
372
	unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
373
				      NVME_CTRL_PAGE_SIZE);
374 375 376
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

C
Chaitanya Kulkarni 已提交
377 378 379 380
/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
381
static int nvme_pci_npages_sgl(void)
382
{
383 384
	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
			PAGE_SIZE);
C
Christoph Hellwig 已提交
385
}
386

387
static size_t nvme_pci_iod_alloc_size(void)
C
Christoph Hellwig 已提交
388
{
389
	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
C
Chaitanya Kulkarni 已提交
390

391 392
	return sizeof(__le64 *) * npages +
		sizeof(struct scatterlist) * NVME_MAX_SEGS;
C
Christoph Hellwig 已提交
393
}
394

M
Matias Bjørling 已提交
395 396
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
397
{
M
Matias Bjørling 已提交
398
	struct nvme_dev *dev = data;
399
	struct nvme_queue *nvmeq = &dev->queues[0];
M
Matias Bjørling 已提交
400

401 402 403
	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);

M
Matias Bjørling 已提交
404 405
	hctx->driver_data = nvmeq;
	return 0;
406 407
}

M
Matias Bjørling 已提交
408 409
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
M
Matthew Wilcox 已提交
410
{
M
Matias Bjørling 已提交
411
	struct nvme_dev *dev = data;
412
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
M
Matias Bjørling 已提交
413

414
	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
M
Matias Bjørling 已提交
415 416
	hctx->driver_data = nvmeq;
	return 0;
M
Matthew Wilcox 已提交
417 418
}

419 420
static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
M
Matthew Wilcox 已提交
421
{
422
	struct nvme_dev *dev = set->driver_data;
C
Christoph Hellwig 已提交
423
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
424
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
425
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];
M
Matias Bjørling 已提交
426 427

	BUG_ON(!nvmeq);
C
Christoph Hellwig 已提交
428
	iod->nvmeq = nvmeq;
429 430

	nvme_req(req)->ctrl = &dev->ctrl;
M
Matias Bjørling 已提交
431 432 433
	return 0;
}

434 435 436 437 438 439 440 441 442
static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}

443 444 445
static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
446 447 448 449 450 451 452 453
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
454
			BUG_ON(i == HCTX_TYPE_DEFAULT);
455
			continue;
456 457
		}

J
Jens Axboe 已提交
458 459 460 461
		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
462
		map->queue_offset = qoff;
463
		if (i != HCTX_TYPE_POLL && offset)
J
Jens Axboe 已提交
464 465 466
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
467 468 469 470 471
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
472 473
}

474 475 476 477
/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
478
{
479 480 481 482 483 484 485 486 487
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

488 489 490
	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
491
	nvmeq->last_sq_tail = nvmeq->sq_tail;
492 493
}

M
Matthew Wilcox 已提交
494
/**
495
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
M
Matthew Wilcox 已提交
496 497
 * @nvmeq: The queue to use
 * @cmd: The command to send
498
 * @write_sq: whether to write to the SQ doorbell
M
Matthew Wilcox 已提交
499
 */
500 501
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
M
Matthew Wilcox 已提交
502
{
503
	spin_lock(&nvmeq->sq_lock);
504 505
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
	       cmd, sizeof(*cmd));
506 507
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
508
	nvme_write_sq_db(nvmeq, write_sq);
509 510 511 512 513 514 515 516
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
517 518
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
519
	spin_unlock(&nvmeq->sq_lock);
M
Matthew Wilcox 已提交
520 521
}

C
Chaitanya Kulkarni 已提交
522
static void **nvme_pci_iod_list(struct request *req)
M
Matthew Wilcox 已提交
523
{
C
Christoph Hellwig 已提交
524
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Chaitanya Kulkarni 已提交
525
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
M
Matthew Wilcox 已提交
526 527
}

528 529 530
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
531
	int nseg = blk_rq_nr_phys_segments(req);
532 533
	unsigned int avg_seg_size;

534
	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
535 536 537 538 539 540 541 542 543 544

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

545
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
M
Matthew Wilcox 已提交
546
{
C
Christoph Hellwig 已提交
547
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
548
	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
C
Chaitanya Kulkarni 已提交
549
	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
550 551
	int i;

552
	if (iod->dma_len) {
553 554
		dma_unmap_page(dev->dev, dma_addr, iod->dma_len,
			       rq_dma_dir(req));
555
		return;
556 557
	}

558 559
	WARN_ON_ONCE(!iod->nents);

560 561 562 563
	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
				    rq_dma_dir(req));
	else
564 565 566
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));


567
	if (iod->npages == 0)
C
Chaitanya Kulkarni 已提交
568 569 570
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			dma_addr);

571
	for (i = 0; i < iod->npages; i++) {
C
Chaitanya Kulkarni 已提交
572 573 574 575 576 577 578 579 580 581 582 583 584 585 586
		void *addr = nvme_pci_iod_list(req)[i];

		if (iod->use_sgl) {
			struct nvme_sgl_desc *sg_list = addr;

			next_dma_addr =
			    le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
		} else {
			__le64 *prp_list = addr;

			next_dma_addr = le64_to_cpu(prp_list[last_prp]);
		}

		dma_pool_free(dev->prp_page_pool, addr, dma_addr);
		dma_addr = next_dma_addr;
587
	}
588

589
	mempool_free(iod->sg, dev->iod_mempool);
K
Keith Busch 已提交
590 591
}

592 593 594 595 596 597 598 599 600 601 602 603 604 605
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}

C
Chaitanya Kulkarni 已提交
606 607
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
M
Matthew Wilcox 已提交
608
{
C
Christoph Hellwig 已提交
609
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
610
	struct dma_pool *pool;
611
	int length = blk_rq_payload_bytes(req);
612
	struct scatterlist *sg = iod->sg;
M
Matthew Wilcox 已提交
613 614
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
615
	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
616
	__le64 *prp_list;
C
Chaitanya Kulkarni 已提交
617
	void **list = nvme_pci_iod_list(req);
618
	dma_addr_t prp_dma;
619
	int nprps, i;
M
Matthew Wilcox 已提交
620

621
	length -= (NVME_CTRL_PAGE_SIZE - offset);
622 623
	if (length <= 0) {
		iod->first_dma = 0;
C
Chaitanya Kulkarni 已提交
624
		goto done;
625
	}
M
Matthew Wilcox 已提交
626

627
	dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
M
Matthew Wilcox 已提交
628
	if (dma_len) {
629
		dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
M
Matthew Wilcox 已提交
630 631 632 633 634 635
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

636
	if (length <= NVME_CTRL_PAGE_SIZE) {
637
		iod->first_dma = dma_addr;
C
Chaitanya Kulkarni 已提交
638
		goto done;
639 640
	}

641
	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
642 643
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
644
		iod->npages = 0;
645 646
	} else {
		pool = dev->prp_page_pool;
647
		iod->npages = 1;
648 649
	}

650
	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
651
	if (!prp_list) {
652
		iod->first_dma = dma_addr;
653
		iod->npages = -1;
654
		return BLK_STS_RESOURCE;
655
	}
656 657
	list[0] = prp_list;
	iod->first_dma = prp_dma;
658 659
	i = 0;
	for (;;) {
660
		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
661
			__le64 *old_prp_list = prp_list;
662
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
663
			if (!prp_list)
664
				return BLK_STS_RESOURCE;
665
			list[iod->npages++] = prp_list;
666 667 668
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
669 670
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
671 672 673
		dma_len -= NVME_CTRL_PAGE_SIZE;
		dma_addr += NVME_CTRL_PAGE_SIZE;
		length -= NVME_CTRL_PAGE_SIZE;
674 675 676 677
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
678 679
		if (unlikely(dma_len < 0))
			goto bad_sgl;
680 681 682
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
M
Matthew Wilcox 已提交
683 684
	}

C
Chaitanya Kulkarni 已提交
685 686 687 688
done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);

689 690 691
	return BLK_STS_OK;

 bad_sgl:
692 693 694
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
695
	return BLK_STS_IOERR;
M
Matthew Wilcox 已提交
696 697
}

C
Chaitanya Kulkarni 已提交
698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
720
		struct request *req, struct nvme_rw_command *cmd, int entries)
C
Chaitanya Kulkarni 已提交
721 722 723 724 725 726
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
727
	int i = 0;
C
Chaitanya Kulkarni 已提交
728 729 730 731

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

732
	if (entries == 1) {
C
Chaitanya Kulkarni 已提交
733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				return BLK_STS_RESOURCE;

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
773
	} while (--entries > 0);
C
Chaitanya Kulkarni 已提交
774 775 776 777

	return BLK_STS_OK;
}

778 779 780 781 782
static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
783 784
	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
785 786 787 788 789 790 791 792 793

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
	if (bv->bv_len > first_prp_len)
		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
794
	return BLK_STS_OK;
795 796
}

797 798 799 800 801 802 803 804 805 806 807
static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

808
	cmnd->flags = NVME_CMD_SGL_METABUF;
809 810 811
	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
812
	return BLK_STS_OK;
813 814
}

815
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
816
		struct nvme_command *cmnd)
817
{
C
Christoph Hellwig 已提交
818
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
819
	blk_status_t ret = BLK_STS_RESOURCE;
820
	int nr_mapped;
821

822 823 824 825
	if (blk_rq_nr_phys_segments(req) == 1) {
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
826
			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
827 828
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);
829 830 831 832 833

			if (iod->nvmeq->qid &&
			    dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
				return nvme_setup_sgl_simple(dev, req,
							     &cmnd->rw, &bv);
834 835 836 837
		}
	}

	iod->dma_len = 0;
838 839 840
	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
	if (!iod->sg)
		return BLK_STS_RESOURCE;
841
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
842
	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
C
Christoph Hellwig 已提交
843 844
	if (!iod->nents)
		goto out;
845

846
	if (is_pci_p2pdma_page(sg_page(iod->sg)))
847 848
		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
849 850
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
851
					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
852
	if (!nr_mapped)
C
Christoph Hellwig 已提交
853
		goto out;
854

855
	iod->use_sgl = nvme_pci_use_sgls(dev, req);
856
	if (iod->use_sgl)
857
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
C
Chaitanya Kulkarni 已提交
858 859
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
860
out:
861
	if (ret != BLK_STS_OK)
862 863 864
		nvme_unmap_data(dev, req);
	return ret;
}
865

866 867 868 869
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
M
Matthew Wilcox 已提交
870

871 872 873 874 875
	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
			rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->meta_dma))
		return BLK_STS_IOERR;
	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
876
	return BLK_STS_OK;
M
Matthew Wilcox 已提交
877 878
}

879 880 881
/*
 * NOTE: ns is NULL when called on the admin queue.
 */
882
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
M
Matias Bjørling 已提交
883
			 const struct blk_mq_queue_data *bd)
884
{
M
Matias Bjørling 已提交
885 886
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
887
	struct nvme_dev *dev = nvmeq->dev;
M
Matias Bjørling 已提交
888
	struct request *req = bd->rq;
889
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Christoph Hellwig 已提交
890
	struct nvme_command cmnd;
891
	blk_status_t ret;
K
Keith Busch 已提交
892

893 894 895 896
	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;

897 898 899 900
	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
901
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
902 903
		return BLK_STS_IOERR;

904
	ret = nvme_setup_cmd(ns, req, &cmnd);
905
	if (ret)
C
Christoph Hellwig 已提交
906
		return ret;
M
Matias Bjørling 已提交
907

908
	if (blk_rq_nr_phys_segments(req)) {
909
		ret = nvme_map_data(dev, req, &cmnd);
910
		if (ret)
911
			goto out_free_cmd;
912
	}
M
Matias Bjørling 已提交
913

914 915 916 917 918 919
	if (blk_integrity_rq(req)) {
		ret = nvme_map_metadata(dev, req, &cmnd);
		if (ret)
			goto out_unmap_data;
	}

920
	blk_mq_start_request(req);
921
	nvme_submit_cmd(nvmeq, &cmnd, bd->last);
922
	return BLK_STS_OK;
923 924
out_unmap_data:
	nvme_unmap_data(dev, req);
925 926
out_free_cmd:
	nvme_cleanup_cmd(req);
C
Christoph Hellwig 已提交
927
	return ret;
M
Matthew Wilcox 已提交
928
}
K
Keith Busch 已提交
929

930
static void nvme_pci_complete_rq(struct request *req)
931
{
C
Christoph Hellwig 已提交
932
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
933
	struct nvme_dev *dev = iod->nvmeq->dev;
M
Matias Bjørling 已提交
934

935 936 937
	if (blk_integrity_rq(req))
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
938
	if (blk_rq_nr_phys_segments(req))
939
		nvme_unmap_data(dev, req);
940
	nvme_complete_rq(req);
M
Matthew Wilcox 已提交
941 942
}

943
/* We read the CQE phase first to check if the rest of the entry is valid */
944
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
945
{
K
Keith Busch 已提交
946 947 948
	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];

	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
949 950
}

951
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
952
{
953
	u16 head = nvmeq->cq_head;
954

955 956 957
	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
958
}
959

C
Christoph Hellwig 已提交
960 961 962 963 964 965 966
static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return nvmeq->dev->admin_tagset.tags[0];
	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}

967
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
968
{
K
Keith Busch 已提交
969
	struct nvme_completion *cqe = &nvmeq->cqes[idx];
970
	struct request *req;
971

972 973 974 975 976 977
	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
978
	if (unlikely(nvme_is_aen_req(nvmeq->qid, cqe->command_id))) {
979 980
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
J
Jens Axboe 已提交
981
		return;
982
	}
M
Matthew Wilcox 已提交
983

C
Christoph Hellwig 已提交
984
	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
985 986 987 988 989 990 991
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

Y
yupeng 已提交
992
	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
993
	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
994
		nvme_pci_complete_rq(req);
995
}
M
Matthew Wilcox 已提交
996

997 998
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
999 1000 1001
	u16 tmp = nvmeq->cq_head + 1;

	if (tmp == nvmeq->q_depth) {
1002
		nvmeq->cq_head = 0;
1003
		nvmeq->cq_phase ^= 1;
1004 1005
	} else {
		nvmeq->cq_head = tmp;
M
Matthew Wilcox 已提交
1006
	}
J
Jens Axboe 已提交
1007 1008
}

1009
static inline int nvme_process_cq(struct nvme_queue *nvmeq)
J
Jens Axboe 已提交
1010
{
1011
	int found = 0;
M
Matthew Wilcox 已提交
1012

1013
	while (nvme_cqe_pending(nvmeq)) {
1014
		found++;
1015 1016 1017 1018 1019
		/*
		 * load-load control dependency between phase and the rest of
		 * the cqe requires a full read memory barrier
		 */
		dma_rmb();
1020
		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
1021
		nvme_update_cq_head(nvmeq);
1022
	}
1023

1024
	if (found)
1025
		nvme_ring_cq_doorbell(nvmeq);
1026
	return found;
M
Matthew Wilcox 已提交
1027 1028 1029
}

static irqreturn_t nvme_irq(int irq, void *data)
1030 1031
{
	struct nvme_queue *nvmeq = data;
1032
	irqreturn_t ret = IRQ_NONE;
1033

1034 1035 1036 1037 1038
	/*
	 * The rmb/wmb pair ensures we see all updates from a previous run of
	 * the irq handler, even if that was on another CPU.
	 */
	rmb();
1039 1040
	if (nvme_process_cq(nvmeq))
		ret = IRQ_HANDLED;
1041
	wmb();
1042

1043
	return ret;
1044 1045 1046 1047 1048
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
1049

1050
	if (nvme_cqe_pending(nvmeq))
1051 1052
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
1053 1054
}

1055
/*
1056
 * Poll for completions for any interrupt driven queue
1057 1058
 * Can be called from any context.
 */
1059
static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
J
Jens Axboe 已提交
1060
{
1061
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
J
Jens Axboe 已提交
1062

1063
	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
1064

1065 1066 1067
	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	nvme_process_cq(nvmeq);
	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
J
Jens Axboe 已提交
1068 1069
}

1070
static int nvme_poll(struct blk_mq_hw_ctx *hctx)
1071 1072 1073 1074 1075 1076 1077
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

1078
	spin_lock(&nvmeq->cq_poll_lock);
1079
	found = nvme_process_cq(nvmeq);
1080
	spin_unlock(&nvmeq->cq_poll_lock);
1081 1082 1083 1084

	return found;
}

1085
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
M
Matthew Wilcox 已提交
1086
{
1087
	struct nvme_dev *dev = to_nvme_dev(ctrl);
1088
	struct nvme_queue *nvmeq = &dev->queues[0];
M
Matias Bjørling 已提交
1089
	struct nvme_command c;
M
Matthew Wilcox 已提交
1090

M
Matias Bjørling 已提交
1091 1092
	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
1093
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1094
	nvme_submit_cmd(nvmeq, &c, true);
1095 1096
}

M
Matthew Wilcox 已提交
1097
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1098
{
M
Matthew Wilcox 已提交
1099 1100 1101 1102 1103 1104
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

1105
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1106 1107 1108
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1109
		struct nvme_queue *nvmeq, s16 vector)
M
Matthew Wilcox 已提交
1110 1111
{
	struct nvme_command c;
J
Jens Axboe 已提交
1112 1113
	int flags = NVME_QUEUE_PHYS_CONTIG;

1114
	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
J
Jens Axboe 已提交
1115
		flags |= NVME_CQ_IRQ_ENABLED;
M
Matthew Wilcox 已提交
1116

1117
	/*
M
Minwoo Im 已提交
1118
	 * Note: we (ab)use the fact that the prp fields survive if no data
1119 1120
	 * is attached to the request.
	 */
M
Matthew Wilcox 已提交
1121 1122 1123 1124 1125 1126
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
1127
	c.create_cq.irq_vector = cpu_to_le16(vector);
M
Matthew Wilcox 已提交
1128

1129
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1130 1131 1132 1133 1134
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
1135
	struct nvme_ctrl *ctrl = &dev->ctrl;
M
Matthew Wilcox 已提交
1136
	struct nvme_command c;
1137
	int flags = NVME_QUEUE_PHYS_CONTIG;
M
Matthew Wilcox 已提交
1138

1139 1140 1141 1142 1143 1144 1145 1146
	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

1147
	/*
M
Minwoo Im 已提交
1148
	 * Note: we (ab)use the fact that the prp fields survive if no data
1149 1150
	 * is attached to the request.
	 */
M
Matthew Wilcox 已提交
1151 1152 1153 1154 1155 1156 1157 1158
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

1159
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

1172
static void abort_endio(struct request *req, blk_status_t error)
1173
{
C
Christoph Hellwig 已提交
1174 1175
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
1176

1177 1178
	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
1179 1180
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
1181 1182
}

K
Keith Busch 已提交
1183 1184 1185 1186 1187 1188 1189
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

1190 1191 1192
	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
1193
	case NVME_CTRL_CONNECTING:
K
Keith Busch 已提交
1194
		return false;
1195 1196 1197
	default:
		break;
	}
K
Keith Busch 已提交
1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
}

1226
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
K
Keith Busch 已提交
1227
{
C
Christoph Hellwig 已提交
1228 1229
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
K
Keith Busch 已提交
1230
	struct nvme_dev *dev = nvmeq->dev;
M
Matias Bjørling 已提交
1231 1232
	struct request *abort_req;
	struct nvme_command cmd;
K
Keith Busch 已提交
1233 1234
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

W
Wen Xiong 已提交
1235 1236 1237 1238 1239 1240 1241
	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

K
Keith Busch 已提交
1242 1243 1244 1245 1246 1247
	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
1248
		nvme_reset_ctrl(&dev->ctrl);
1249
		return BLK_EH_DONE;
K
Keith Busch 已提交
1250
	}
K
Keith Busch 已提交
1251

K
Keith Busch 已提交
1252 1253 1254
	/*
	 * Did we miss an interrupt?
	 */
1255 1256 1257 1258 1259
	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
		nvme_poll(req->mq_hctx);
	else
		nvme_poll_irqdisable(nvmeq);

1260
	if (blk_mq_request_completed(req)) {
K
Keith Busch 已提交
1261 1262 1263
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
1264
		return BLK_EH_DONE;
K
Keith Busch 已提交
1265 1266
	}

1267
	/*
1268 1269 1270
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
1271
	 * shutdown, so we return BLK_EH_DONE.
1272
	 */
1273 1274
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
1275
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
1276
		fallthrough;
1277
	case NVME_CTRL_DELETING:
1278
		dev_warn_ratelimited(dev->ctrl.device,
1279 1280
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
1281
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1282
		nvme_dev_disable(dev, true);
1283
		return BLK_EH_DONE;
1284 1285
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
1286 1287
	default:
		break;
K
Keith Busch 已提交
1288 1289
	}

1290
	/*
B
Baolin Wang 已提交
1291 1292 1293
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
1294
	 */
C
Christoph Hellwig 已提交
1295
	if (!nvmeq->qid || iod->aborted) {
1296
		dev_warn(dev->ctrl.device,
1297 1298
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
1299
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1300
		nvme_dev_disable(dev, false);
1301
		nvme_reset_ctrl(&dev->ctrl);
K
Keith Busch 已提交
1302

1303
		return BLK_EH_DONE;
K
Keith Busch 已提交
1304 1305
	}

1306
	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
1307
		atomic_inc(&dev->ctrl.abort_limit);
1308
		return BLK_EH_RESET_TIMER;
1309
	}
1310
	iod->aborted = 1;
M
Matias Bjørling 已提交
1311

K
Keith Busch 已提交
1312 1313
	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
M
Matias Bjørling 已提交
1314
	cmd.abort.cid = req->tag;
K
Keith Busch 已提交
1315 1316
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

1317 1318 1319
	dev_warn(nvmeq->dev->ctrl.device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);
1320 1321

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
1322
			BLK_MQ_REQ_NOWAIT);
1323 1324 1325 1326 1327 1328 1329
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
K
Keith Busch 已提交
1330

1331 1332 1333 1334 1335 1336
	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
K
Keith Busch 已提交
1337 1338
}

M
Matias Bjørling 已提交
1339 1340
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
1341
	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
1342
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1343 1344
	if (!nvmeq->sq_cmds)
		return;
1345

1346
	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
1347
		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
1348
				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1349
	} else {
1350
		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
1351
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1352
	}
1353 1354
}

1355
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1356 1357 1358
{
	int i;

1359 1360
	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
1361
		nvme_free_queue(&dev->queues[i]);
1362
	}
1363 1364
}

K
Keith Busch 已提交
1365 1366
/**
 * nvme_suspend_queue - put queue into suspended state
1367
 * @nvmeq: queue to suspend
K
Keith Busch 已提交
1368 1369
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
1370
{
1371
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
K
Keith Busch 已提交
1372
		return 1;
1373

1374
	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
1375
	mb();
1376

1377
	nvmeq->dev->online_queues--;
1378
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1379
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
1380 1381
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
K
Keith Busch 已提交
1382 1383
	return 0;
}
M
Matthew Wilcox 已提交
1384

1385 1386 1387 1388 1389 1390 1391 1392
static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
}

1393
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
K
Keith Busch 已提交
1394
{
1395
	struct nvme_queue *nvmeq = &dev->queues[0];
K
Keith Busch 已提交
1396

1397 1398 1399
	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
1400
		nvme_disable_ctrl(&dev->ctrl);
1401

1402
	nvme_poll_irqdisable(nvmeq);
M
Matthew Wilcox 已提交
1403 1404
}

1405 1406
/*
 * Called only on a device that has been disabled and after all other threads
1407 1408 1409
 * that can check this device's completion queues have synced, except
 * nvme_poll(). This is the last chance for the driver to see a natural
 * completion before nvme_cancel_request() terminates all incomplete requests.
1410 1411 1412 1413 1414
 */
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
	int i;

1415 1416
	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
		spin_lock(&dev->queues[i].cq_poll_lock);
1417
		nvme_process_cq(&dev->queues[i]);
1418 1419
		spin_unlock(&dev->queues[i].cq_poll_lock);
	}
1420 1421
}

1422 1423 1424 1425
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
1426
	unsigned q_size_aligned = roundup(q_depth * entry_size,
1427
					  NVME_CTRL_PAGE_SIZE);
1428 1429

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
1430
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
1431

1432
		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
1433
		q_depth = div_u64(mem_per_q, entry_size);
1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1448
				int qid)
1449
{
1450 1451 1452
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1453
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
1454 1455 1456 1457 1458 1459 1460 1461
		if (nvmeq->sq_cmds) {
			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
							nvmeq->sq_cmds);
			if (nvmeq->sq_dma_addr) {
				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
				return 0;
			}

1462
			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1463
		}
1464
	}
1465

1466
	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
1467
				&nvmeq->sq_dma_addr, GFP_KERNEL);
1468 1469
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
1470 1471 1472
	return 0;
}

1473
static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
M
Matthew Wilcox 已提交
1474
{
1475
	struct nvme_queue *nvmeq = &dev->queues[qid];
M
Matthew Wilcox 已提交
1476

1477 1478
	if (dev->ctrl.queue_count > qid)
		return 0;
M
Matthew Wilcox 已提交
1479

1480
	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
1481 1482
	nvmeq->q_depth = depth;
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
1483
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
M
Matthew Wilcox 已提交
1484 1485 1486
	if (!nvmeq->cqes)
		goto free_nvmeq;

1487
	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
M
Matthew Wilcox 已提交
1488 1489
		goto free_cqdma;

M
Matthew Wilcox 已提交
1490
	nvmeq->dev = dev;
1491
	spin_lock_init(&nvmeq->sq_lock);
1492
	spin_lock_init(&nvmeq->cq_poll_lock);
M
Matthew Wilcox 已提交
1493
	nvmeq->cq_head = 0;
M
Matthew Wilcox 已提交
1494
	nvmeq->cq_phase = 1;
1495
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
K
Keith Busch 已提交
1496
	nvmeq->qid = qid;
1497
	dev->ctrl.queue_count++;
1498

1499
	return 0;
M
Matthew Wilcox 已提交
1500 1501

 free_cqdma:
1502 1503
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
M
Matthew Wilcox 已提交
1504
 free_nvmeq:
1505
	return -ENOMEM;
M
Matthew Wilcox 已提交
1506 1507
}

1508
static int queue_request_irq(struct nvme_queue *nvmeq)
1509
{
1510 1511 1512 1513 1514 1515 1516 1517 1518 1519
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
1520 1521
}

1522
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
M
Matthew Wilcox 已提交
1523
{
1524
	struct nvme_dev *dev = nvmeq->dev;
M
Matthew Wilcox 已提交
1525

1526
	nvmeq->sq_tail = 0;
1527
	nvmeq->last_sq_tail = 0;
1528 1529
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
1530
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1531
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
1532
	nvme_dbbuf_init(dev, nvmeq, qid);
K
Keith Busch 已提交
1533
	dev->online_queues++;
1534
	wmb(); /* ensure the first interrupt sees the initialization */
1535 1536
}

J
Jens Axboe 已提交
1537
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1538 1539 1540
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
1541
	u16 vector = 0;
1542

1543 1544
	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

1545 1546 1547 1548
	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
J
Jens Axboe 已提交
1549 1550 1551
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
1552
		set_bit(NVMEQ_POLLED, &nvmeq->flags);
J
Jens Axboe 已提交
1553

1554
	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
K
Keith Busch 已提交
1555 1556
	if (result)
		return result;
M
Matthew Wilcox 已提交
1557 1558 1559

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
K
Keith Busch 已提交
1560
		return result;
1561
	if (result)
M
Matthew Wilcox 已提交
1562 1563
		goto release_cq;

1564
	nvmeq->cq_vector = vector;
1565
	nvme_init_queue(nvmeq, qid);
J
Jens Axboe 已提交
1566

1567
	if (!polled) {
J
Jens Axboe 已提交
1568 1569 1570 1571
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}
M
Matthew Wilcox 已提交
1572

1573
	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
1574
	return result;
M
Matthew Wilcox 已提交
1575

1576
release_sq:
1577
	dev->online_queues--;
M
Matthew Wilcox 已提交
1578
	adapter_delete_sq(dev, qid);
1579
release_cq:
M
Matthew Wilcox 已提交
1580
	adapter_delete_cq(dev, qid);
1581
	return result;
M
Matthew Wilcox 已提交
1582 1583
}

1584
static const struct blk_mq_ops nvme_mq_admin_ops = {
1585
	.queue_rq	= nvme_queue_rq,
1586
	.complete	= nvme_pci_complete_rq,
M
Matias Bjørling 已提交
1587
	.init_hctx	= nvme_admin_init_hctx,
1588
	.init_request	= nvme_init_request,
M
Matias Bjørling 已提交
1589 1590 1591
	.timeout	= nvme_timeout,
};

1592
static const struct blk_mq_ops nvme_mq_ops = {
1593 1594 1595 1596 1597 1598 1599 1600
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
1601 1602
};

1603 1604
static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
1605
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
1606 1607 1608 1609 1610
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
1611
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
1612
		blk_cleanup_queue(dev->ctrl.admin_q);
1613 1614 1615 1616
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

M
Matias Bjørling 已提交
1617 1618
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
1619
	if (!dev->ctrl.admin_q) {
M
Matias Bjørling 已提交
1620 1621
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
K
Keith Busch 已提交
1622

K
Keith Busch 已提交
1623
		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1624
		dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
1625
		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
1626
		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
1627
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
M
Matias Bjørling 已提交
1628 1629 1630 1631
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
1632
		dev->ctrl.admin_tagset = &dev->admin_tagset;
M
Matias Bjørling 已提交
1633

1634 1635
		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
M
Matias Bjørling 已提交
1636 1637 1638
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
1639
		if (!blk_get_queue(dev->ctrl.admin_q)) {
1640
			nvme_dev_remove_admin(dev);
1641
			dev->ctrl.admin_q = NULL;
1642 1643
			return -ENODEV;
		}
K
Keith Busch 已提交
1644
	} else
1645
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
M
Matias Bjørling 已提交
1646 1647 1648 1649

	return 0;
}

1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

1676
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
1677
{
1678
	int result;
M
Matthew Wilcox 已提交
1679 1680 1681
	u32 aqa;
	struct nvme_queue *nvmeq;

1682 1683 1684 1685
	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

1686
	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
1687
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
1688

1689 1690 1691
	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
1692

1693
	result = nvme_disable_ctrl(&dev->ctrl);
1694 1695
	if (result < 0)
		return result;
M
Matthew Wilcox 已提交
1696

1697
	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
1698 1699
	if (result)
		return result;
M
Matthew Wilcox 已提交
1700

1701 1702
	dev->ctrl.numa_node = dev_to_node(dev->dev);

1703
	nvmeq = &dev->queues[0];
M
Matthew Wilcox 已提交
1704 1705 1706
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

1707 1708 1709
	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
M
Matthew Wilcox 已提交
1710

1711
	result = nvme_enable_ctrl(&dev->ctrl);
1712
	if (result)
K
Keith Busch 已提交
1713
		return result;
M
Matias Bjørling 已提交
1714

K
Keith Busch 已提交
1715
	nvmeq->cq_vector = 0;
1716
	nvme_init_queue(nvmeq, 0);
1717
	result = queue_request_irq(nvmeq);
1718
	if (result) {
1719
		dev->online_queues--;
K
Keith Busch 已提交
1720
		return result;
1721
	}
1722

1723
	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
M
Matthew Wilcox 已提交
1724 1725 1726
	return result;
}

static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
	} else {
		rw_queues = max;
	}

	for (i = dev->online_queues; i <= max; i++) {
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands: we can continue with fewer
	 * than the desired number of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade buggy firmware, for example.
	 */
	return ret >= 0 ? 0 : ret;
}
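
/*
 * Rough worked example of the split above (illustrative numbers only): with
 * dev->max_qid == 8, HCTX_TYPE_DEFAULT == 4, HCTX_TYPE_READ == 2 and
 * HCTX_TYPE_POLL == 2, rw_queues is 6, so qids 1-6 are created with
 * interrupts and qids 7-8 are created as polled queues.
 */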

static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}
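
/*
 * CMBSZ.SZU encodes the size unit as 1 << (12 + 4 * SZU): 0 -> 4 KiB,
 * 1 -> 64 KiB, 2 -> 1 MiB, and so on, and CMBSZ.SZ is the number of those
 * units.  As an illustrative example, SZU == 2 with SZ == 16 describes a
 * 16 MiB controller memory buffer.
 */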

static void nvme_map_cmb(struct nvme_dev *dev)
{
	u64 size, offset;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int bar;

	if (dev->cmb_size)
		return;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!dev->cmbsz)
		return;
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);

	if (offset > bar_size)
		return;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
		return;
	}

	dev->cmb_size = size;
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb_size) {
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
		dev->cmb_size = 0;
	}
}

static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
	u64 dma_addr = dev->host_mem_descs_dma;
	struct nvme_command c;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode	= nvme_admin_set_features;
	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	c.features.dword11	= cpu_to_le32(bits);
	c.features.dword12	= cpu_to_le32(host_mem_size);
	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
	if (ret) {
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	}
	return ret;
}

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;

		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
			       le64_to_cpu(desc->addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
	dev->host_mem_descs = NULL;
	dev->nr_host_mem_descs = 0;
}
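
/*
 * For context (summarizing the helpers above): the host memory buffer is
 * described to the controller as an array of nvme_host_mem_buf_desc entries,
 * each holding a DMA address and a size in controller pages.  The descriptor
 * list address goes into dword13/14 of the Set Features command
 * (NVME_FEAT_HOST_MEM_BUF), the entry count into dword15, the total size in
 * pages into dword12, and the enable/return flags into dword11.
 */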

static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	struct nvme_host_mem_buf_desc *descs;
	u32 max_entries, len;
	dma_addr_t descs_dma;
	int i = 0;
	void **bufs;
	u64 size, tmp;

	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
				   &descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	for (size = 0; size < preferred && i < max_entries; size += len) {
		dma_addr_t dma_addr;

		len = min_t(u64, chunk_size, preferred - size);
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
		i++;
	}

	if (!size)
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--i >= 0) {
		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;

		dma_free_attrs(dev->dev, size, bufs[i],
			       le64_to_cpu(descs[i].addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	u64 chunk_size;

	/* start big and work our way down */
	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}

static int nvme_setup_host_mem(struct nvme_dev *dev)
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
	int ret;

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
		return 0;
	}

	/*
	 * If we already have a buffer allocated, check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
			return 0; /* controller must work without HMB */
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
	}

	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
		nvme_free_host_mem(dev);
	return ret;
}
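
/*
 * Illustrative sizing only: HMPRE and HMMIN from Identify Controller are in
 * 4 KiB units, so a controller reporting hmpre == 8192 prefers a 32 MiB
 * buffer.  With the default max_host_mem_size_mb of 128 that preferred size
 * is used as-is, while a controller asking for more than 128 MiB would be
 * clamped to the module parameter.
 */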

/*
 * nrirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
 */
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
{
	struct nvme_dev *dev = affd->priv;
	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;

	/*
	 * If there is no interrupt available for queues, ensure that
	 * the default queue is set to 1. The affinity set size is
	 * also set to one, but the irq core ignores it for this case.
	 *
	 * If only one interrupt is available or 'write_queues' == 0, combine
	 * write and read queues.
	 *
	 * If 'write_queues' > 0, ensure it leaves room for at least one read
	 * queue.
	 */
	if (!nrirqs) {
		nrirqs = 1;
		nr_read_queues = 0;
	} else if (nrirqs == 1 || !nr_write_queues) {
		nr_read_queues = 0;
	} else if (nr_write_queues >= nrirqs) {
		nr_read_queues = 1;
	} else {
		nr_read_queues = nrirqs - nr_write_queues;
	}

	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
	affd->nr_sets = nr_read_queues ? 2 : 1;
}
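
/*
 * Worked example for the calculation above (illustrative numbers): with
 * nrirqs == 8 and write_queues == 2, nr_read_queues becomes 6, so the
 * default (write) set gets 2 vectors and the read set gets 6.  With
 * write_queues == 0 all 8 vectors land in the default set and only one
 * affinity set is used.
 */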

static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	struct irq_affinity affd = {
		.pre_vectors	= 1,
		.calc_sets	= nvme_calc_irq_sets,
		.priv		= dev,
	};
	unsigned int irq_queues, poll_queues;

	/*
	 * Poll queues don't need interrupts, but we need at least one I/O queue
	 * left over for non-polled I/O.
	 */
	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;

	/*
	 * Initialize for the single interrupt case, will be updated in
	 * nvme_calc_irq_sets().
	 */
	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
	dev->io_queues[HCTX_TYPE_READ] = 0;

	/*
	 * We need interrupts for the admin queue and each non-polled I/O queue,
	 * but some Apple controllers require all queues to use the first
	 * vector.
	 */
	irq_queues = 1;
	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
		irq_queues += (nr_io_queues - poll_queues);
	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}

static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
	/*
	 * If tags are shared with admin queue (Apple bug), then
	 * make sure we only use one IO queue.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
		return 1;
	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
}
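
/*
 * Example only (hypothetical numbers): on a machine with 16 possible CPUs
 * booted with write_queues=4 and poll_queues=2, the calculation above allows
 * up to 22 I/O queues; the admin queue is accounted for separately by the
 * caller when sizing dev->queues.
 */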

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = &dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	unsigned int nr_io_queues;
	unsigned long size;
	int result;

	/*
	 * Sample the module parameters once at reset time so that we have
	 * stable values to work with.
	 */
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;

	nr_io_queues = dev->nr_allocated_queues - 1;
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	clear_bit(NVMEQ_ENABLED, &adminq->flags);

	if (dev->cmb_use_sqes) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			dev->cmb_use_sqes = false;
	}

	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues)
			return -ENOMEM;
	} while (1);
	adminq->q_db = dev->dbs;

 retry:
	/* Deregister the admin queue's interrupt */
	pci_free_irq(pdev, 0, adminq);

	/*
	 * If we enabled MSI-X early because INTx is unavailable, disable it
	 * again before setting up the full range we need.
	 */
	pci_free_irq_vectors(pdev);

	result = nvme_setup_irqs(dev, nr_io_queues);
	if (result <= 0)
		return -EIO;

	dev->num_vecs = result;
	result = max(result - 1, 1);
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	result = queue_request_irq(adminq);
	if (result)
		return result;
	set_bit(NVMEQ_ENABLED, &adminq->flags);

	result = nvme_create_io_queues(dev);
	if (result || dev->online_queues < 2)
		return result;

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_disable_io_queues(dev);
		nvme_suspend_io_queues(dev);
		goto retry;
	}
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
	return 0;
}
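
/*
 * Vector accounting above, sketched with made-up numbers: if 10 I/O queues
 * were granted and 2 of them are polled, irq_queues is 1 + (10 - 2) = 9 (one
 * pre-reserved vector for the admin queue plus one per non-polled I/O queue).
 * If the PCI layer only returns, say, 5 vectors, max_qid becomes
 * (5 - 1) + 2 = 6, and if even fewer queues come online the code drops back
 * to the retry label with the reduced count.
 */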

static void nvme_del_queue_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	blk_mq_free_request(req);
	complete(&nvmeq->delete_done);
}

static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	if (error)
		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	nvme_del_queue_end(req, error);
}

static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->end_io_data = nvmeq;

	init_completion(&nvmeq->delete_done);
	blk_execute_rq_nowait(q, NULL, req, false,
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
}

static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int nr_queues = dev->online_queues - 1, sent = 0;
	unsigned long timeout;

 retry:
	timeout = NVME_ADMIN_TIMEOUT;
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
	}
	while (sent) {
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
				timeout);
		if (timeout == 0)
			return false;

		sent--;
		if (nr_queues)
			goto retry;
	}
	return true;
}

static void nvme_dev_add(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.nr_maps = 2; /* default + read */
		if (dev->io_queues[HCTX_TYPE_POLL])
			dev->tagset.nr_maps++;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev->ctrl.numa_node;
		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
						BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = sizeof(struct nvme_iod);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		/*
		 * Some Apple controllers require tags to be unique across the
		 * admin and IO queues, so reserve the first 32 tags of the IO
		 * queue.
		 */
		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
			dev->tagset.reserved_tags = NVME_AQ_DEPTH;

		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
			return;
		}
		dev->ctrl.tagset = &dev->tagset;
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
	}

	nvme_dbbuf_set(dev);
}

static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)))
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;

	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Some Apple controllers require a non-standard SQE size.
	 * Interestingly they also seem to ignore the CC:IOSQES register
	 * so we don't bother updating it here.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
		dev->io_sqes = 7;
	else
		dev->io_sqes = NVME_NVM_IOSQES;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
			"set queue depth=%u\n", dev->q_depth);
	}

	/*
	 * Controllers with the shared tags quirk need the IO queue to be
	 * big enough so that we get 32 tags for the admin queue
	 */
	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
		dev->q_depth = NVME_AQ_DEPTH + 2;
		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
			 dev->q_depth);
	}

	nvme_map_cmb(dev);

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}
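
/*
 * Queue depth sizing above, illustrated: CAP.MQES is 0's based, so a
 * controller reporting MQES == 1023 supports 1024 entries per I/O queue.
 * The driver takes the smaller of that and the io_queue_depth module
 * parameter (1024 by default) and reports sqsize back to the core as the
 * 0's based value.
 */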

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->bar)
		iounmap(dev->bar);
	pci_release_mem_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	pci_free_irq_vectors(pdev);

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
}

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	bool dead = true, freeze = false;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			freeze = true;
			nvme_start_freeze(&dev->ctrl);
		}
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state != pci_channel_io_normal);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead && shutdown && freeze)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);

	nvme_stop_queues(&dev->ctrl);

	if (!dead && dev->ctrl.queue_count > 0) {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
	nvme_pci_disable(dev);
	nvme_reap_pending_cqes(dev);

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_wait_completed_request(&dev->tagset);
	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);

	/*
	 * The driver will not be starting up queues again if shutting down,
	 * so we must flush all entered requests to their failed completion
	 * to avoid deadlocking the blk-mq hot-cpu notifier.
	 */
	if (shutdown) {
		nvme_start_queues(&dev->ctrl);
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
	}
	mutex_unlock(&dev->shutdown_lock);
}

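/*
 * Ordering note for nvme_dev_disable() above (descriptive only): queues are
 * quiesced before the controller is disabled, outstanding requests are then
 * cancelled and waited for, and only a safe shutdown restarts the queues so
 * that remaining requests can reach their completion.  The small wrapper
 * below additionally waits for any in-flight reset before disabling, which
 * the reset_prepare, shutdown and suspend paths rely on.
 */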
static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (!nvme_wait_reset(&dev->ctrl))
		return -EBUSY;
	nvme_dev_disable(dev, shutdown);
	return 0;
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						NVME_CTRL_PAGE_SIZE,
						NVME_CTRL_PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_free_tagset(struct nvme_dev *dev)
{
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	dev->ctrl.tagset = NULL;
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	nvme_free_tagset(dev);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	free_opal_dev(dev->ctrl.opal_dev);
	mempool_destroy(dev->iod_mempool);
	put_device(dev->dev);
	kfree(dev->queues);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
	if (!queue_work(nvme_wq, &dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result;

	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
		result = -ENODEV;
		goto out;
	}

	/*
	 * If we're called to reset a live controller, first shut it down
	 * before moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);
	nvme_sync_queues(&dev->ctrl);

	mutex_lock(&dev->shutdown_lock);
	result = nvme_pci_enable(dev);
	if (result)
		goto out_unlock;

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto out_unlock;

	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto out_unlock;

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	/*
	 * Don't limit the IOMMU merged segment size.
	 */
	dma_set_max_seg_size(dev->dev, 0xffffffff);

	mutex_unlock(&dev->shutdown_lock);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		result = -EBUSY;
		goto out;
	}

	/*
	 * We do not support an SGL for metadata (yet), so we are limited to a
	 * single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto out;

	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
	}

	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't
	 * have any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		nvme_free_tagset(dev);
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		nvme_dev_add(dev);
		nvme_unfreeze(&dev->ctrl);
	}

	/*
	 * If only the admin queue is alive, keep it for further investigation
	 * or recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller live state\n");
		result = -ENODEV;
		goto out;
	}

	nvme_start_ctrl(&dev->ctrl);
	return;

 out_unlock:
	mutex_unlock(&dev->shutdown_lock);
 out:
	if (result)
		dev_warn(dev->ctrl.device,
			 "Removing after probe failure status: %d\n", result);
	nvme_remove_dead_ctrl(dev);
}
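
/*
 * High-level flow of nvme_reset_work() above, as a reading aid: RESETTING ->
 * (re)enable PCI and the admin queue -> mark CONNECTING -> identify the
 * controller and set up optional features (OPAL, dbbuf, HMB) -> bring up the
 * I/O queues -> mark LIVE and start the controller.  Any failure funnels
 * through the out/out_unlock labels into nvme_remove_dead_ctrl().
 */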

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		device_release_driver(&pdev->dev);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name			= "pcie",
	.module			= THIS_MODULE,
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.free_ctrl		= nvme_pci_free_ctrl,
	.submit_async_event	= nvme_pci_submit_async_event,
	.get_address		= nvme_pci_get_address,
};

static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_request_mem_regions(pdev, "nvme"))
		return -ENODEV;

	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
		goto release;

	return 0;
  release:
	pci_release_mem_regions(pdev);
	return -ENODEV;
}

static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and the deepest sleep state is
		 * used. This has been observed on a Samsung "SM951 NVMe
		 * SAMSUNG 256GB", a "PM951 NVMe SAMSUNG 512GB", and a
		 * "Samsung SSD 950 PRO 256GB", but it seems to be restricted
		 * to two Dell laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within a few minutes after bootup on a Coffee Lake board,
		 * ASUS PRIME Z370-A.
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
			return NVME_QUIRK_NO_APST;
	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
		/*
		 * Force host managed NVMe power settings for lowest idle
		 * power with quick resume latency on Samsung and Toshiba
		 * SSDs, based on suspend behavior observed on a Coffee Lake
		 * board (LENOVO C640).
		 */
		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
			return NVME_QUIRK_SIMPLE_SUSPEND;
	}

	return 0;
}

#ifdef CONFIG_ACPI
static bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
	struct acpi_device *adev;
	struct pci_dev *root;
	acpi_handle handle;
	acpi_status status;
	u8 val;

	/*
	 * Look for _DSD property specifying that the storage device on the port
	 * must use D3 to support deep platform power savings during
	 * suspend-to-idle.
	 */
	root = pcie_find_root_port(dev);
	if (!root)
		return false;

	adev = ACPI_COMPANION(&root->dev);
	if (!adev)
		return false;

	/*
	 * The property is defined in the PXSX device for South complex ports
	 * and in the PEGP device for North complex ports.
	 */
	status = acpi_get_handle(adev->handle, "PXSX", &handle);
	if (ACPI_FAILURE(status)) {
		status = acpi_get_handle(adev->handle, "PEGP", &handle);
		if (ACPI_FAILURE(status))
			return false;
	}

	if (acpi_bus_get_device(handle, &adev))
		return false;

	if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
			&val))
		return false;
	return val == 1;
}
#else
static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
	return false;
}
#endif /* CONFIG_ACPI */
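
/*
 * Illustrative only: in ACPI terms the check above looks for a _DSD with a
 * "StorageD3Enable" property equal to 1 on the PXSX/PEGP companion of the
 * root port, e.g. something like
 *
 *   Name (_DSD, Package () {
 *       ToUUID ("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"),
 *       Package () { Package () { "StorageD3Enable", 1 } }
 *   })
 *
 * which tells the driver to prefer a full shutdown (D3) over NVMe power
 * states during suspend-to-idle.
 */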

static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;

	flush_work(&dev->ctrl.reset_work);
	flush_work(&dev->ctrl.scan_work);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;
	unsigned long quirks = id->driver_data;
	size_t alloc_size;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;

	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto put_pci;

	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto unmap;

	quirks |= check_vendor_combination_bug(pdev);

	if (!noacpi && nvme_acpi_storage_d3(pdev)) {
		/*
		 * Some systems use a BIOS workaround to ask for D3 on
		 * platforms that support kernel managed suspend.
		 */
		dev_info(&pdev->dev,
			 "platform quirk: setting simple suspend\n");
		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
	}

	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
	alloc_size = nvme_pci_iod_alloc_size();
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

	nvme_reset_ctrl(&dev->ctrl);
	async_schedule(nvme_async_probe, dev);

	return 0;

 release_mempool:
	mempool_destroy(dev->iod_mempool);
 release_pools:
	nvme_release_prp_pools(dev);
 unmap:
	nvme_dev_unmap(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

static void nvme_reset_prepare(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * We don't need to check the return value from waiting for the reset
	 * state as the pci_dev device lock is held, making it impossible to
	 * race with ->remove().
	 */
	nvme_disable_prepare_reset(dev, false);
	nvme_sync_queues(&dev->ctrl);
}

static void nvme_reset_done(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	if (!nvme_try_sched_reset(&dev->ctrl))
		flush_work(&dev->ctrl.reset_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_disable_prepare_reset(dev, true);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	pci_set_drvdata(pdev, NULL);

	if (!pci_device_is_present(pdev)) {
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
		nvme_dev_disable(dev, true);
		nvme_dev_remove_admin(dev);
	}

	flush_work(&dev->ctrl.reset_work);
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_release_cmb(dev);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_uninit_ctrl(&dev->ctrl);
}

#ifdef CONFIG_PM_SLEEP
static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
}

static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
{
	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
}

static int nvme_resume(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
	struct nvme_ctrl *ctrl = &ndev->ctrl;

	if (ndev->last_ps == U32_MAX ||
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
		return nvme_try_sched_reset(&ndev->ctrl);
	return 0;
}

static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);
	struct nvme_ctrl *ctrl = &ndev->ctrl;
	int ret = -EBUSY;

	ndev->last_ps = U32_MAX;

	/*
	 * The platform does not remove power for a kernel managed suspend, so
	 * use host managed nvme power settings for lowest idle power if
	 * possible. This should have quicker resume latency than a full device
	 * shutdown.  But if the firmware is involved after the suspend or the
	 * device does not support any non-default power states, shut down the
	 * device fully.
	 *
	 * If ASPM is not enabled for the device, shut down the device and allow
	 * the PCI bus layer to put it into D3 in order to take the PCIe link
	 * down, so as to allow the platform to achieve its minimum low-power
	 * state (which may not be possible if the link is up).
	 *
	 * If a host memory buffer is enabled, shut down the device as the NVMe
	 * specification allows the device to access the host memory buffer in
	 * host DRAM from all power states, but hosts will fail access to DRAM
	 * during S3.
	 */
	if (pm_suspend_via_firmware() || !ctrl->npss ||
	    !pcie_aspm_enabled(pdev) ||
	    ndev->nr_host_mem_descs ||
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);

	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

	if (ctrl->state != NVME_CTRL_LIVE)
		goto unfreeze;

	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
	if (ret < 0)
		goto unfreeze;

	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);

	ret = nvme_set_power_state(ctrl, ctrl->npss);
	if (ret < 0)
		goto unfreeze;

	if (ret) {
		/* discard the saved state */
		pci_load_saved_state(pdev, NULL);

		/*
		 * Clearing npss forces a controller reset on resume. The
		 * correct value will be rediscovered then.
		 */
		ret = nvme_disable_prepare_reset(ndev, true);
		ctrl->npss = 0;
	}
unfreeze:
	nvme_unfreeze(ctrl);
	return ret;
}
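
/*
 * Putting the policy above into an example (assumed, not measured): on a
 * suspend-to-idle without firmware involvement, a device with NPSS > 0, ASPM
 * enabled, no HMB in use and no SIMPLE_SUSPEND quirk is left enabled and
 * merely moved to its deepest power state via Set Features (Power
 * Management); if any of those conditions does not hold, the driver falls
 * back to a full controller shutdown and a reset on resume.
 */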

static int nvme_simple_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));

	return nvme_disable_prepare_reset(ndev, true);
}

static int nvme_simple_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	return nvme_try_sched_reset(&ndev->ctrl);
}

static const struct dev_pm_ops nvme_dev_pm_ops = {
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.freeze		= nvme_simple_suspend,
	.thaw		= nvme_simple_resume,
	.poweroff	= nvme_simple_suspend,
	.restore	= nvme_simple_resume,
};
#endif /* CONFIG_PM_SLEEP */

static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through the driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	dev_info(dev->ctrl.device, "restart after slot reset\n");
	pci_restore_state(pdev);
	nvme_reset_ctrl(&dev->ctrl);
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
}

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
};

static const struct pci_device_id nvme_id_table[] = {
	{ PCI_VDEVICE(INTEL, 0x0953),	/* Intel 750/P3500/P3600/P3700 */
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a53),	/* Intel P3520 */
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a54),	/* Intel P4500/P4600 */
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a55),	/* Dell Express Flash P4600 */
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_MEDIUM_PRIO_SQ |
				NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LightNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
	{ PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
	{ PCI_DEVICE(0x15b7, 0x2001),   /* Sandisk Skyhawk */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
				NVME_QUIRK_128_BYTES_SQES |
				NVME_QUIRK_SHARED_TAGS },

	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
#ifdef CONFIG_PM_SLEEP
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
#endif
	.sriov_configure = pci_sriov_configure_simple,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);

	return pci_register_driver(&nvme_driver);
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	flush_workqueue(nvme_wq);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);