pci.c 88.5 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
M
Matthew Wilcox 已提交
2 3
/*
 * NVM Express device driver
4
 * Copyright (c) 2011-2014, Intel Corporation.
M
Matthew Wilcox 已提交
5 6
 */

7
#include <linux/acpi.h>
K
Keith Busch 已提交
8
#include <linux/aer.h>
9
#include <linux/async.h>
M
Matthew Wilcox 已提交
10
#include <linux/blkdev.h>
M
Matias Bjørling 已提交
11
#include <linux/blk-mq.h>
12
#include <linux/blk-mq-pci.h>
13
#include <linux/dmi.h>
M
Matthew Wilcox 已提交
14 15 16 17 18
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
19
#include <linux/mutex.h>
20
#include <linux/once.h>
M
Matthew Wilcox 已提交
21
#include <linux/pci.h>
22
#include <linux/suspend.h>
K
Keith Busch 已提交
23
#include <linux/t10-pi.h>
M
Matthew Wilcox 已提交
24
#include <linux/types.h>
25
#include <linux/io-64-nonatomic-lo-hi.h>
26
#include <linux/io-64-nonatomic-hi-lo.h>
27
#include <linux/sed-opal.h>
28
#include <linux/pci-p2pdma.h>
29

Y
yupeng 已提交
30
#include "trace.h"
31 32
#include "nvme.h"

33
#define SQ_SIZE(q)	((q)->q_depth << (q)->sqes)
34
#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))
35

C
Chaitanya Kulkarni 已提交
36
#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))
37

38 39 40 41 42 43 44
/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127

45 46 47
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

48
static bool use_cmb_sqes = true;
49
module_param(use_cmb_sqes, bool, 0444);
50 51
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

52 53 54 55
static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
56

C
Chaitanya Kulkarni 已提交
57 58 59 60 61 62
static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

63 64
#define NVME_PCI_MIN_QUEUE_SIZE 2
#define NVME_PCI_MAX_QUEUE_SIZE 4095
65 66 67
static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
68
	.get = param_get_uint,
69 70
};

71
static unsigned int io_queue_depth = 1024;
72
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
73
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2 and < 4096");
74

75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
static int io_queue_count_set(const char *val, const struct kernel_param *kp)
{
	unsigned int n;
	int ret;

	ret = kstrtouint(val, 10, &n);
	if (ret != 0 || n > num_possible_cpus())
		return -EINVAL;
	return param_set_uint(val, kp);
}

static const struct kernel_param_ops io_queue_count_ops = {
	.set = io_queue_count_set,
	.get = param_get_uint,
};

91
static unsigned int write_queues;
92
module_param_cb(write_queues, &io_queue_count_ops, &write_queues, 0644);
93 94 95 96
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

97
static unsigned int poll_queues;
98
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
J
Jens Axboe 已提交
99 100
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

101 102 103 104
static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");

105 106
struct nvme_dev;
struct nvme_queue;
107

108
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
109
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
110

111 112 113 114
/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
115
	struct nvme_queue *queues;
116 117 118 119 120 121 122 123
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
124
	unsigned io_queues[HCTX_MAX_TYPES];
125
	unsigned int num_vecs;
126
	u32 q_depth;
127
	int io_sqes;
128 129
	u32 db_stride;
	void __iomem *bar;
130
	unsigned long bar_mapped_size;
131
	struct work_struct remove_work;
132
	struct mutex shutdown_lock;
133 134
	bool subsystem;
	u64 cmb_size;
135
	bool cmb_use_sqes;
136
	u32 cmbsz;
137
	u32 cmbloc;
138
	struct nvme_ctrl ctrl;
139
	u32 last_ps;
140
	bool hmb;
141

142 143
	mempool_t *iod_mempool;

144
	/* shadow doorbell buffer support: */
145 146 147 148
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;
149 150 151 152

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
153
	dma_addr_t host_mem_descs_dma;
154 155
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
156 157 158
	unsigned int nr_allocated_queues;
	unsigned int nr_write_queues;
	unsigned int nr_poll_queues;
159 160

	bool attrs_added;
K
Keith Busch 已提交
161
};
162

163 164
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
165 166
	return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
			NVME_PCI_MAX_QUEUE_SIZE);
167 168
}

169 170 171 172 173 174 175 176 177 178
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}

179 180 181 182 183
static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

M
Matthew Wilcox 已提交
184 185 186 187 188
/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
M
Matthew Wilcox 已提交
189
	struct nvme_dev *dev;
190
	spinlock_t sq_lock;
191
	void *sq_cmds;
192 193
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
K
Keith Busch 已提交
194
	struct nvme_completion *cqes;
M
Matthew Wilcox 已提交
195 196 197
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
198
	u32 q_depth;
199
	u16 cq_vector;
M
Matthew Wilcox 已提交
200
	u16 sq_tail;
201
	u16 last_sq_tail;
M
Matthew Wilcox 已提交
202
	u16 cq_head;
K
Keith Busch 已提交
203
	u16 qid;
204
	u8 cq_phase;
205
	u8 sqes;
206 207
	unsigned long flags;
#define NVMEQ_ENABLED		0
208
#define NVMEQ_SQ_CMB		1
209
#define NVMEQ_DELETE_ERROR	2
210
#define NVMEQ_POLLED		3
211 212 213 214
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
215
	struct completion delete_done;
M
Matthew Wilcox 已提交
216 217
};

218
/*
219 220 221 222
 * The nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
 * to the actual struct scatterlist.
223 224
 */
struct nvme_iod {
225
	struct nvme_request req;
226
	struct nvme_command cmd;
C
Christoph Hellwig 已提交
227
	struct nvme_queue *nvmeq;
C
Chaitanya Kulkarni 已提交
228
	bool use_sgl;
C
Christoph Hellwig 已提交
229
	int aborted;
230 231 232
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	dma_addr_t first_dma;
233
	unsigned int dma_len;	/* length of single DMA segment mapping */
234
	dma_addr_t meta_dma;
C
Christoph Hellwig 已提交
235
	struct scatterlist *sg;
M
Matthew Wilcox 已提交
236 237
};

238
static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
239
{
240
	return dev->nr_allocated_queues * 8 * dev->db_stride;
241 242 243 244
}

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
245
	unsigned int mem_size = nvme_dbbuf_size(dev);
246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
270
	unsigned int mem_size = nvme_dbbuf_size(dev);
271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

296 297 298 299 300 301 302 303 304 305 306
static void nvme_dbbuf_free(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return;

	nvmeq->dbbuf_sq_db = NULL;
	nvmeq->dbbuf_cq_db = NULL;
	nvmeq->dbbuf_sq_ei = NULL;
	nvmeq->dbbuf_cq_ei = NULL;
}

307 308
static void nvme_dbbuf_set(struct nvme_dev *dev)
{
309
	struct nvme_command c = { };
310
	unsigned int i;
311 312 313 314 315 316 317 318 319

	if (!dev->dbbuf_dbs)
		return;

	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
320
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
321 322
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
323 324 325

		for (i = 1; i <= dev->online_queues; i++)
			nvme_dbbuf_free(&dev->queues[i]);
326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
	}
}

static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

350 351 352 353 354 355 356 357
		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory.  The controller needs to provide similar
		 * ordering to ensure the envent index is updated before reading
		 * the doorbell.
		 */
		mb();

358 359 360 361 362
		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
M
Matthew Wilcox 已提交
363 364
}

365 366 367 368 369
/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
370
static int nvme_pci_npages_prp(void)
371
{
372
	unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
373
				      NVME_CTRL_PAGE_SIZE);
374 375 376
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

C
Chaitanya Kulkarni 已提交
377 378 379 380
/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
381
static int nvme_pci_npages_sgl(void)
382
{
383 384
	return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
			PAGE_SIZE);
C
Christoph Hellwig 已提交
385
}
386

387
static size_t nvme_pci_iod_alloc_size(void)
C
Christoph Hellwig 已提交
388
{
389
	size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
C
Chaitanya Kulkarni 已提交
390

391 392
	return sizeof(__le64 *) * npages +
		sizeof(struct scatterlist) * NVME_MAX_SEGS;
C
Christoph Hellwig 已提交
393
}
394

M
Matias Bjørling 已提交
395 396
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
397
{
M
Matias Bjørling 已提交
398
	struct nvme_dev *dev = data;
399
	struct nvme_queue *nvmeq = &dev->queues[0];
M
Matias Bjørling 已提交
400

401 402 403
	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);

M
Matias Bjørling 已提交
404 405
	hctx->driver_data = nvmeq;
	return 0;
406 407
}

M
Matias Bjørling 已提交
408 409
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
M
Matthew Wilcox 已提交
410
{
M
Matias Bjørling 已提交
411
	struct nvme_dev *dev = data;
412
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
M
Matias Bjørling 已提交
413

414
	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
M
Matias Bjørling 已提交
415 416
	hctx->driver_data = nvmeq;
	return 0;
M
Matthew Wilcox 已提交
417 418
}

419 420
static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
M
Matthew Wilcox 已提交
421
{
422
	struct nvme_dev *dev = set->driver_data;
C
Christoph Hellwig 已提交
423
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
424
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
425
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];
M
Matias Bjørling 已提交
426 427

	BUG_ON(!nvmeq);
C
Christoph Hellwig 已提交
428
	iod->nvmeq = nvmeq;
429 430

	nvme_req(req)->ctrl = &dev->ctrl;
431
	nvme_req(req)->cmd = &iod->cmd;
M
Matias Bjørling 已提交
432 433 434
	return 0;
}

435 436 437 438 439 440 441 442 443
static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}

444 445 446
static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
447 448 449 450 451 452 453 454
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
455
			BUG_ON(i == HCTX_TYPE_DEFAULT);
456
			continue;
457 458
		}

J
Jens Axboe 已提交
459 460 461 462
		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
463
		map->queue_offset = qoff;
464
		if (i != HCTX_TYPE_POLL && offset)
J
Jens Axboe 已提交
465 466 467
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
468 469 470 471 472
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
473 474
}

475 476 477 478
/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
479
{
480 481 482 483 484 485 486 487 488
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

489 490 491
	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
492
	nvmeq->last_sq_tail = nvmeq->sq_tail;
493 494
}

M
Matthew Wilcox 已提交
495
/**
496
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
M
Matthew Wilcox 已提交
497 498
 * @nvmeq: The queue to use
 * @cmd: The command to send
499
 * @write_sq: whether to write to the SQ doorbell
M
Matthew Wilcox 已提交
500
 */
501 502
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
M
Matthew Wilcox 已提交
503
{
504
	spin_lock(&nvmeq->sq_lock);
505 506
	memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
	       cmd, sizeof(*cmd));
507 508
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
509
	nvme_write_sq_db(nvmeq, write_sq);
510 511 512 513 514 515 516 517
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
518 519
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
520
	spin_unlock(&nvmeq->sq_lock);
M
Matthew Wilcox 已提交
521 522
}

C
Chaitanya Kulkarni 已提交
523
static void **nvme_pci_iod_list(struct request *req)
M
Matthew Wilcox 已提交
524
{
C
Christoph Hellwig 已提交
525
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Chaitanya Kulkarni 已提交
526
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
M
Matthew Wilcox 已提交
527 528
}

529 530 531
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
532
	int nseg = blk_rq_nr_phys_segments(req);
533 534
	unsigned int avg_seg_size;

535
	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
536

537
	if (!nvme_ctrl_sgl_supported(&dev->ctrl))
538 539 540 541 542 543 544 545
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

546
static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
M
Matthew Wilcox 已提交
547
{
548
	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
549 550
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	dma_addr_t dma_addr = iod->first_dma;
551 552
	int i;

553 554 555 556 557 558
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = nvme_pci_iod_list(req)[i];
		dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);

		dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
		dma_addr = next_dma_addr;
559
	}
560
}
561

562 563 564 565 566 567
static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
{
	const int last_sg = SGES_PER_PAGE - 1;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	dma_addr_t dma_addr = iod->first_dma;
	int i;
568

569 570 571
	for (i = 0; i < iod->npages; i++) {
		struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
		dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
572

573 574 575 576
		dma_pool_free(dev->prp_page_pool, sg_list, dma_addr);
		dma_addr = next_dma_addr;
	}
}
C
Chaitanya Kulkarni 已提交
577

578 579 580
static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Chaitanya Kulkarni 已提交
581

582 583 584 585 586 587
	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
				    rq_dma_dir(req));
	else
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
}
C
Chaitanya Kulkarni 已提交
588

589 590 591
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Chaitanya Kulkarni 已提交
592

593 594 595 596
	if (iod->dma_len) {
		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
			       rq_dma_dir(req));
		return;
597
	}
598

599 600 601 602 603 604 605 606 607 608
	WARN_ON_ONCE(!iod->nents);

	nvme_unmap_sg(dev, req);
	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			      iod->first_dma);
	else if (iod->use_sgl)
		nvme_free_sgls(dev, req);
	else
		nvme_free_prps(dev, req);
609
	mempool_free(iod->sg, dev->iod_mempool);
K
Keith Busch 已提交
610 611
}

612 613 614 615 616 617 618 619 620 621 622 623 624 625
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}

C
Chaitanya Kulkarni 已提交
626 627
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
M
Matthew Wilcox 已提交
628
{
C
Christoph Hellwig 已提交
629
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
630
	struct dma_pool *pool;
631
	int length = blk_rq_payload_bytes(req);
632
	struct scatterlist *sg = iod->sg;
M
Matthew Wilcox 已提交
633 634
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
635
	int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
636
	__le64 *prp_list;
C
Chaitanya Kulkarni 已提交
637
	void **list = nvme_pci_iod_list(req);
638
	dma_addr_t prp_dma;
639
	int nprps, i;
M
Matthew Wilcox 已提交
640

641
	length -= (NVME_CTRL_PAGE_SIZE - offset);
642 643
	if (length <= 0) {
		iod->first_dma = 0;
C
Chaitanya Kulkarni 已提交
644
		goto done;
645
	}
M
Matthew Wilcox 已提交
646

647
	dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
M
Matthew Wilcox 已提交
648
	if (dma_len) {
649
		dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
M
Matthew Wilcox 已提交
650 651 652 653 654 655
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

656
	if (length <= NVME_CTRL_PAGE_SIZE) {
657
		iod->first_dma = dma_addr;
C
Chaitanya Kulkarni 已提交
658
		goto done;
659 660
	}

661
	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
662 663
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
664
		iod->npages = 0;
665 666
	} else {
		pool = dev->prp_page_pool;
667
		iod->npages = 1;
668 669
	}

670
	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
671
	if (!prp_list) {
672
		iod->first_dma = dma_addr;
673
		iod->npages = -1;
674
		return BLK_STS_RESOURCE;
675
	}
676 677
	list[0] = prp_list;
	iod->first_dma = prp_dma;
678 679
	i = 0;
	for (;;) {
680
		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
681
			__le64 *old_prp_list = prp_list;
682
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
683
			if (!prp_list)
684
				goto free_prps;
685
			list[iod->npages++] = prp_list;
686 687 688
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
689 690
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
691 692 693
		dma_len -= NVME_CTRL_PAGE_SIZE;
		dma_addr += NVME_CTRL_PAGE_SIZE;
		length -= NVME_CTRL_PAGE_SIZE;
694 695 696 697
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
698 699
		if (unlikely(dma_len < 0))
			goto bad_sgl;
700 701 702
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
M
Matthew Wilcox 已提交
703
	}
C
Chaitanya Kulkarni 已提交
704 705 706
done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
707
	return BLK_STS_OK;
708 709 710 711
free_prps:
	nvme_free_prps(dev, req);
	return BLK_STS_RESOURCE;
bad_sgl:
712 713 714
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
715
	return BLK_STS_IOERR;
M
Matthew Wilcox 已提交
716 717
}

C
Chaitanya Kulkarni 已提交
718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
740
		struct request *req, struct nvme_rw_command *cmd, int entries)
C
Chaitanya Kulkarni 已提交
741 742 743 744 745 746
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
747
	int i = 0;
C
Chaitanya Kulkarni 已提交
748 749 750 751

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

752
	if (entries == 1) {
C
Chaitanya Kulkarni 已提交
753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
783
				goto free_sgls;
C
Chaitanya Kulkarni 已提交
784 785 786 787 788 789 790 791 792

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
793
	} while (--entries > 0);
C
Chaitanya Kulkarni 已提交
794 795

	return BLK_STS_OK;
796 797 798
free_sgls:
	nvme_free_sgls(dev, req);
	return BLK_STS_RESOURCE;
C
Chaitanya Kulkarni 已提交
799 800
}

801 802 803 804 805
static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
806 807
	unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
	unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
808 809 810 811 812 813 814 815 816

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
	if (bv->bv_len > first_prp_len)
		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
817
	return BLK_STS_OK;
818 819
}

820 821 822 823 824 825 826 827 828 829 830
static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

831
	cmnd->flags = NVME_CMD_SGL_METABUF;
832 833 834
	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
835
	return BLK_STS_OK;
836 837
}

838
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
839
		struct nvme_command *cmnd)
840
{
C
Christoph Hellwig 已提交
841
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
842
	blk_status_t ret = BLK_STS_RESOURCE;
843
	int nr_mapped;
844

845 846 847 848
	if (blk_rq_nr_phys_segments(req) == 1) {
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
849
			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
850 851
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);
852

853
			if (iod->nvmeq->qid && sgl_threshold &&
854
			    nvme_ctrl_sgl_supported(&dev->ctrl))
855 856
				return nvme_setup_sgl_simple(dev, req,
							     &cmnd->rw, &bv);
857 858 859 860
		}
	}

	iod->dma_len = 0;
861 862 863
	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
	if (!iod->sg)
		return BLK_STS_RESOURCE;
864
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
865
	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
C
Christoph Hellwig 已提交
866
	if (!iod->nents)
867
		goto out_free_sg;
868

869
	if (is_pci_p2pdma_page(sg_page(iod->sg)))
870 871
		nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
				iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
872 873
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
874
					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
875
	if (!nr_mapped)
876
		goto out_free_sg;
877

878
	iod->use_sgl = nvme_pci_use_sgls(dev, req);
879
	if (iod->use_sgl)
880
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
C
Chaitanya Kulkarni 已提交
881 882
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
883
	if (ret != BLK_STS_OK)
884 885 886 887 888 889 890
		goto out_unmap_sg;
	return BLK_STS_OK;

out_unmap_sg:
	nvme_unmap_sg(dev, req);
out_free_sg:
	mempool_free(iod->sg, dev->iod_mempool);
891 892
	return ret;
}
893

894 895 896 897
static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
M
Matthew Wilcox 已提交
898

899 900 901 902 903
	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
			rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->meta_dma))
		return BLK_STS_IOERR;
	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
904
	return BLK_STS_OK;
M
Matthew Wilcox 已提交
905 906
}

907 908 909
/*
 * NOTE: ns is NULL when called on the admin queue.
 */
910
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
M
Matias Bjørling 已提交
911
			 const struct blk_mq_queue_data *bd)
912
{
M
Matias Bjørling 已提交
913 914
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
915
	struct nvme_dev *dev = nvmeq->dev;
M
Matias Bjørling 已提交
916
	struct request *req = bd->rq;
917
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
918
	struct nvme_command *cmnd = &iod->cmd;
919
	blk_status_t ret;
K
Keith Busch 已提交
920

921 922 923 924
	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;

925 926 927 928
	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
929
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
930 931
		return BLK_STS_IOERR;

932 933 934
	if (!nvme_check_ready(&dev->ctrl, req, true))
		return nvme_fail_nonready_command(&dev->ctrl, req);

935
	ret = nvme_setup_cmd(ns, req);
936
	if (ret)
C
Christoph Hellwig 已提交
937
		return ret;
M
Matias Bjørling 已提交
938

939
	if (blk_rq_nr_phys_segments(req)) {
940
		ret = nvme_map_data(dev, req, cmnd);
941
		if (ret)
942
			goto out_free_cmd;
943
	}
M
Matias Bjørling 已提交
944

945
	if (blk_integrity_rq(req)) {
946
		ret = nvme_map_metadata(dev, req, cmnd);
947 948 949 950
		if (ret)
			goto out_unmap_data;
	}

951
	blk_mq_start_request(req);
952
	nvme_submit_cmd(nvmeq, cmnd, bd->last);
953
	return BLK_STS_OK;
954 955
out_unmap_data:
	nvme_unmap_data(dev, req);
956 957
out_free_cmd:
	nvme_cleanup_cmd(req);
C
Christoph Hellwig 已提交
958
	return ret;
M
Matthew Wilcox 已提交
959
}
K
Keith Busch 已提交
960

961
static void nvme_pci_complete_rq(struct request *req)
962
{
C
Christoph Hellwig 已提交
963
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
964
	struct nvme_dev *dev = iod->nvmeq->dev;
M
Matias Bjørling 已提交
965

966 967 968
	if (blk_integrity_rq(req))
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
969
	if (blk_rq_nr_phys_segments(req))
970
		nvme_unmap_data(dev, req);
971
	nvme_complete_rq(req);
M
Matthew Wilcox 已提交
972 973
}

974
/* We read the CQE phase first to check if the rest of the entry is valid */
975
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
976
{
K
Keith Busch 已提交
977 978 979
	struct nvme_completion *hcqe = &nvmeq->cqes[nvmeq->cq_head];

	return (le16_to_cpu(READ_ONCE(hcqe->status)) & 1) == nvmeq->cq_phase;
980 981
}

982
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
983
{
984
	u16 head = nvmeq->cq_head;
985

986 987 988
	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
989
}
990

C
Christoph Hellwig 已提交
991 992 993 994 995 996 997
static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
	if (!nvmeq->qid)
		return nvmeq->dev->admin_tagset.tags[0];
	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}

998
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
999
{
K
Keith Busch 已提交
1000
	struct nvme_completion *cqe = &nvmeq->cqes[idx];
1001
	__u16 command_id = READ_ONCE(cqe->command_id);
1002
	struct request *req;
1003

1004 1005 1006 1007 1008 1009
	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
1010
	if (unlikely(nvme_is_aen_req(nvmeq->qid, command_id))) {
1011 1012
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
J
Jens Axboe 已提交
1013
		return;
1014
	}
M
Matthew Wilcox 已提交
1015

1016
	req = nvme_find_rq(nvme_queue_tagset(nvmeq), command_id);
1017 1018 1019
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
1020
			command_id, le16_to_cpu(cqe->sq_id));
1021 1022 1023
		return;
	}

Y
yupeng 已提交
1024
	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
1025
	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
1026
		nvme_pci_complete_rq(req);
1027
}
M
Matthew Wilcox 已提交
1028

1029 1030
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
1031
	u32 tmp = nvmeq->cq_head + 1;
1032 1033

	if (tmp == nvmeq->q_depth) {
1034
		nvmeq->cq_head = 0;
1035
		nvmeq->cq_phase ^= 1;
1036 1037
	} else {
		nvmeq->cq_head = tmp;
M
Matthew Wilcox 已提交
1038
	}
J
Jens Axboe 已提交
1039 1040
}

1041
static inline int nvme_process_cq(struct nvme_queue *nvmeq)
J
Jens Axboe 已提交
1042
{
1043
	int found = 0;
M
Matthew Wilcox 已提交
1044

1045
	while (nvme_cqe_pending(nvmeq)) {
1046
		found++;
1047 1048 1049 1050 1051
		/*
		 * load-load control dependency between phase and the rest of
		 * the cqe requires a full read memory barrier
		 */
		dma_rmb();
1052
		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
1053
		nvme_update_cq_head(nvmeq);
1054
	}
1055

1056
	if (found)
1057
		nvme_ring_cq_doorbell(nvmeq);
1058
	return found;
M
Matthew Wilcox 已提交
1059 1060 1061
}

static irqreturn_t nvme_irq(int irq, void *data)
1062 1063
{
	struct nvme_queue *nvmeq = data;
1064

1065
	if (nvme_process_cq(nvmeq))
1066 1067
		return IRQ_HANDLED;
	return IRQ_NONE;
1068 1069 1070 1071 1072
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
1073

1074
	if (nvme_cqe_pending(nvmeq))
1075 1076
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
1077 1078
}

1079
/*
1080
 * Poll for completions for any interrupt driven queue
1081 1082
 * Can be called from any context.
 */
1083
static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
J
Jens Axboe 已提交
1084
{
1085
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
J
Jens Axboe 已提交
1086

1087
	WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags));
1088

1089 1090 1091
	disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	nvme_process_cq(nvmeq);
	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
J
Jens Axboe 已提交
1092 1093
}

1094
static int nvme_poll(struct blk_mq_hw_ctx *hctx)
1095 1096 1097 1098 1099 1100 1101
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

1102
	spin_lock(&nvmeq->cq_poll_lock);
1103
	found = nvme_process_cq(nvmeq);
1104
	spin_unlock(&nvmeq->cq_poll_lock);
1105 1106 1107 1108

	return found;
}

1109
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
M
Matthew Wilcox 已提交
1110
{
1111
	struct nvme_dev *dev = to_nvme_dev(ctrl);
1112
	struct nvme_queue *nvmeq = &dev->queues[0];
1113
	struct nvme_command c = { };
M
Matthew Wilcox 已提交
1114

M
Matias Bjørling 已提交
1115
	c.common.opcode = nvme_admin_async_event;
1116
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1117
	nvme_submit_cmd(nvmeq, &c, true);
1118 1119
}

M
Matthew Wilcox 已提交
1120
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1121
{
1122
	struct nvme_command c = { };
M
Matthew Wilcox 已提交
1123 1124 1125 1126

	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

1127
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1128 1129 1130
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1131
		struct nvme_queue *nvmeq, s16 vector)
M
Matthew Wilcox 已提交
1132
{
1133
	struct nvme_command c = { };
J
Jens Axboe 已提交
1134 1135
	int flags = NVME_QUEUE_PHYS_CONTIG;

1136
	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
J
Jens Axboe 已提交
1137
		flags |= NVME_CQ_IRQ_ENABLED;
M
Matthew Wilcox 已提交
1138

1139
	/*
M
Minwoo Im 已提交
1140
	 * Note: we (ab)use the fact that the prp fields survive if no data
1141 1142
	 * is attached to the request.
	 */
M
Matthew Wilcox 已提交
1143 1144 1145 1146 1147
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
1148
	c.create_cq.irq_vector = cpu_to_le16(vector);
M
Matthew Wilcox 已提交
1149

1150
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1151 1152 1153 1154 1155
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
1156
	struct nvme_ctrl *ctrl = &dev->ctrl;
1157
	struct nvme_command c = { };
1158
	int flags = NVME_QUEUE_PHYS_CONTIG;
M
Matthew Wilcox 已提交
1159

1160 1161 1162 1163 1164 1165 1166 1167
	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

1168
	/*
M
Minwoo Im 已提交
1169
	 * Note: we (ab)use the fact that the prp fields survive if no data
1170 1171
	 * is attached to the request.
	 */
M
Matthew Wilcox 已提交
1172 1173 1174 1175 1176 1177 1178
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

1179
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

1192
static void abort_endio(struct request *req, blk_status_t error)
1193
{
C
Christoph Hellwig 已提交
1194 1195
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
1196

1197 1198
	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
1199 1200
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
1201 1202
}

K
Keith Busch 已提交
1203 1204 1205 1206 1207 1208 1209
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{
	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

1210 1211 1212
	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
1213
	case NVME_CTRL_CONNECTING:
K
Keith Busch 已提交
1214
		return false;
1215 1216 1217
	default:
		break;
	}
K
Keith Busch 已提交
1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
}

1246
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
K
Keith Busch 已提交
1247
{
C
Christoph Hellwig 已提交
1248 1249
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
K
Keith Busch 已提交
1250
	struct nvme_dev *dev = nvmeq->dev;
M
Matias Bjørling 已提交
1251
	struct request *abort_req;
1252
	struct nvme_command cmd = { };
K
Keith Busch 已提交
1253 1254
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

W
Wen Xiong 已提交
1255 1256 1257 1258 1259 1260 1261
	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

K
Keith Busch 已提交
1262 1263 1264 1265 1266 1267
	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
1268
		nvme_reset_ctrl(&dev->ctrl);
1269
		return BLK_EH_DONE;
K
Keith Busch 已提交
1270
	}
K
Keith Busch 已提交
1271

K
Keith Busch 已提交
1272 1273 1274
	/*
	 * Did we miss an interrupt?
	 */
1275 1276 1277 1278 1279
	if (test_bit(NVMEQ_POLLED, &nvmeq->flags))
		nvme_poll(req->mq_hctx);
	else
		nvme_poll_irqdisable(nvmeq);

1280
	if (blk_mq_request_completed(req)) {
K
Keith Busch 已提交
1281 1282 1283
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
1284
		return BLK_EH_DONE;
K
Keith Busch 已提交
1285 1286
	}

1287
	/*
1288 1289 1290
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
1291
	 * shutdown, so we return BLK_EH_DONE.
1292
	 */
1293 1294
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
1295
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
1296
		fallthrough;
1297
	case NVME_CTRL_DELETING:
1298
		dev_warn_ratelimited(dev->ctrl.device,
1299 1300
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
1301
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1302
		nvme_dev_disable(dev, true);
1303
		return BLK_EH_DONE;
1304 1305
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
1306 1307
	default:
		break;
K
Keith Busch 已提交
1308 1309
	}

1310
	/*
B
Baolin Wang 已提交
1311 1312 1313
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
1314
	 */
C
Christoph Hellwig 已提交
1315
	if (!nvmeq->qid || iod->aborted) {
1316
		dev_warn(dev->ctrl.device,
1317 1318
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
1319
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1320
		nvme_dev_disable(dev, false);
1321
		nvme_reset_ctrl(&dev->ctrl);
K
Keith Busch 已提交
1322

1323
		return BLK_EH_DONE;
K
Keith Busch 已提交
1324 1325
	}

1326
	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
1327
		atomic_inc(&dev->ctrl.abort_limit);
1328
		return BLK_EH_RESET_TIMER;
1329
	}
1330
	iod->aborted = 1;
M
Matias Bjørling 已提交
1331

K
Keith Busch 已提交
1332
	cmd.abort.opcode = nvme_admin_abort_cmd;
M
Matias Bjørling 已提交
1333
	cmd.abort.cid = req->tag;
K
Keith Busch 已提交
1334 1335
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

1336 1337 1338
	dev_warn(nvmeq->dev->ctrl.device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);
1339 1340

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
1341
			BLK_MQ_REQ_NOWAIT);
1342 1343 1344 1345 1346 1347
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->end_io_data = NULL;
1348
	blk_execute_rq_nowait(NULL, abort_req, 0, abort_endio);
K
Keith Busch 已提交
1349

1350 1351 1352 1353 1354 1355
	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
K
Keith Busch 已提交
1356 1357
}

M
Matias Bjørling 已提交
1358 1359
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
1360
	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
1361
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1362 1363
	if (!nvmeq->sq_cmds)
		return;
1364

1365
	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
1366
		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
1367
				nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1368
	} else {
1369
		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq),
1370
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1371
	}
1372 1373
}

1374
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1375 1376 1377
{
	int i;

1378 1379
	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
1380
		nvme_free_queue(&dev->queues[i]);
1381
	}
1382 1383
}

K
Keith Busch 已提交
1384 1385
/**
 * nvme_suspend_queue - put queue into suspended state
1386
 * @nvmeq: queue to suspend
K
Keith Busch 已提交
1387 1388
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
1389
{
1390
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
K
Keith Busch 已提交
1391
		return 1;
1392

1393
	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
1394
	mb();
1395

1396
	nvmeq->dev->online_queues--;
1397
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1398
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
1399 1400
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
K
Keith Busch 已提交
1401 1402
	return 0;
}
M
Matthew Wilcox 已提交
1403

1404 1405 1406 1407 1408 1409 1410 1411
static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
}

1412
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
K
Keith Busch 已提交
1413
{
1414
	struct nvme_queue *nvmeq = &dev->queues[0];
K
Keith Busch 已提交
1415

1416 1417 1418
	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
1419
		nvme_disable_ctrl(&dev->ctrl);
1420

1421
	nvme_poll_irqdisable(nvmeq);
M
Matthew Wilcox 已提交
1422 1423
}

1424 1425
/*
 * Called only on a device that has been disabled and after all other threads
1426 1427 1428
 * that can check this device's completion queues have synced, except
 * nvme_poll(). This is the last chance for the driver to see a natural
 * completion before nvme_cancel_request() terminates all incomplete requests.
1429 1430 1431 1432 1433
 */
static void nvme_reap_pending_cqes(struct nvme_dev *dev)
{
	int i;

1434 1435
	for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
		spin_lock(&dev->queues[i].cq_poll_lock);
1436
		nvme_process_cq(&dev->queues[i]);
1437 1438
		spin_unlock(&dev->queues[i].cq_poll_lock);
	}
1439 1440
}

1441 1442 1443 1444
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
1445
	unsigned q_size_aligned = roundup(q_depth * entry_size,
1446
					  NVME_CTRL_PAGE_SIZE);
1447 1448

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
1449
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
1450

1451
		mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
1452
		q_depth = div_u64(mem_per_q, entry_size);
1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
1467
				int qid)
1468
{
1469 1470 1471
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
1472
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq));
1473 1474 1475 1476 1477 1478 1479 1480
		if (nvmeq->sq_cmds) {
			nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
							nvmeq->sq_cmds);
			if (nvmeq->sq_dma_addr) {
				set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
				return 0;
			}

1481
			pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq));
1482
		}
1483
	}
1484

1485
	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq),
1486
				&nvmeq->sq_dma_addr, GFP_KERNEL);
1487 1488
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
1489 1490 1491
	return 0;
}

1492
static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
M
Matthew Wilcox 已提交
1493
{
1494
	struct nvme_queue *nvmeq = &dev->queues[qid];
M
Matthew Wilcox 已提交
1495

1496 1497
	if (dev->ctrl.queue_count > qid)
		return 0;
M
Matthew Wilcox 已提交
1498

1499
	nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES;
1500 1501
	nvmeq->q_depth = depth;
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
1502
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
M
Matthew Wilcox 已提交
1503 1504 1505
	if (!nvmeq->cqes)
		goto free_nvmeq;

1506
	if (nvme_alloc_sq_cmds(dev, nvmeq, qid))
M
Matthew Wilcox 已提交
1507 1508
		goto free_cqdma;

M
Matthew Wilcox 已提交
1509
	nvmeq->dev = dev;
1510
	spin_lock_init(&nvmeq->sq_lock);
1511
	spin_lock_init(&nvmeq->cq_poll_lock);
M
Matthew Wilcox 已提交
1512
	nvmeq->cq_head = 0;
M
Matthew Wilcox 已提交
1513
	nvmeq->cq_phase = 1;
1514
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
K
Keith Busch 已提交
1515
	nvmeq->qid = qid;
1516
	dev->ctrl.queue_count++;
1517

1518
	return 0;
M
Matthew Wilcox 已提交
1519 1520

 free_cqdma:
1521 1522
	dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
M
Matthew Wilcox 已提交
1523
 free_nvmeq:
1524
	return -ENOMEM;
M
Matthew Wilcox 已提交
1525 1526
}

1527
static int queue_request_irq(struct nvme_queue *nvmeq)
1528
{
1529 1530 1531 1532 1533 1534 1535 1536 1537 1538
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
1539 1540
}

1541
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
M
Matthew Wilcox 已提交
1542
{
1543
	struct nvme_dev *dev = nvmeq->dev;
M
Matthew Wilcox 已提交
1544

1545
	nvmeq->sq_tail = 0;
1546
	nvmeq->last_sq_tail = 0;
1547 1548
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
1549
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1550
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq));
1551
	nvme_dbbuf_init(dev, nvmeq, qid);
K
Keith Busch 已提交
1552
	dev->online_queues++;
1553
	wmb(); /* ensure the first interrupt sees the initialization */
1554 1555
}

1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577
/*
 * Try getting shutdown_lock while setting up IO queues.
 */
static int nvme_setup_io_queues_trylock(struct nvme_dev *dev)
{
	/*
	 * Give up if the lock is being held by nvme_dev_disable.
	 */
	if (!mutex_trylock(&dev->shutdown_lock))
		return -ENODEV;

	/*
	 * Controller is in wrong state, fail early.
	 */
	if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
		mutex_unlock(&dev->shutdown_lock);
		return -ENODEV;
	}

	return 0;
}

J
Jens Axboe 已提交
1578
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1579 1580 1581
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
1582
	u16 vector = 0;
1583

1584 1585
	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

1586 1587 1588 1589
	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
J
Jens Axboe 已提交
1590 1591 1592
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
1593
		set_bit(NVMEQ_POLLED, &nvmeq->flags);
J
Jens Axboe 已提交
1594

1595
	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
K
Keith Busch 已提交
1596 1597
	if (result)
		return result;
M
Matthew Wilcox 已提交
1598 1599 1600

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
K
Keith Busch 已提交
1601
		return result;
1602
	if (result)
M
Matthew Wilcox 已提交
1603 1604
		goto release_cq;

1605
	nvmeq->cq_vector = vector;
J
Jens Axboe 已提交
1606

1607 1608 1609 1610
	result = nvme_setup_io_queues_trylock(dev);
	if (result)
		return result;
	nvme_init_queue(nvmeq, qid);
1611
	if (!polled) {
J
Jens Axboe 已提交
1612 1613 1614 1615
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}
M
Matthew Wilcox 已提交
1616

1617
	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
1618
	mutex_unlock(&dev->shutdown_lock);
1619
	return result;
M
Matthew Wilcox 已提交
1620

1621
release_sq:
1622
	dev->online_queues--;
1623
	mutex_unlock(&dev->shutdown_lock);
M
Matthew Wilcox 已提交
1624
	adapter_delete_sq(dev, qid);
1625
release_cq:
M
Matthew Wilcox 已提交
1626
	adapter_delete_cq(dev, qid);
1627
	return result;
M
Matthew Wilcox 已提交
1628 1629
}

1630
static const struct blk_mq_ops nvme_mq_admin_ops = {
1631
	.queue_rq	= nvme_queue_rq,
1632
	.complete	= nvme_pci_complete_rq,
M
Matias Bjørling 已提交
1633
	.init_hctx	= nvme_admin_init_hctx,
1634
	.init_request	= nvme_init_request,
M
Matias Bjørling 已提交
1635 1636 1637
	.timeout	= nvme_timeout,
};

1638
static const struct blk_mq_ops nvme_mq_ops = {
1639 1640 1641 1642 1643 1644 1645 1646
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
1647 1648
};

1649 1650
static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
1651
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
1652 1653 1654 1655 1656
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
1657
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
1658
		blk_cleanup_queue(dev->ctrl.admin_q);
1659 1660 1661 1662
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

M
Matias Bjørling 已提交
1663 1664
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
1665
	if (!dev->ctrl.admin_q) {
M
Matias Bjørling 已提交
1666 1667
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
K
Keith Busch 已提交
1668

K
Keith Busch 已提交
1669
		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1670
		dev->admin_tagset.timeout = NVME_ADMIN_TIMEOUT;
1671
		dev->admin_tagset.numa_node = dev->ctrl.numa_node;
1672
		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
1673
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
M
Matias Bjørling 已提交
1674 1675 1676 1677
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
1678
		dev->ctrl.admin_tagset = &dev->admin_tagset;
M
Matias Bjørling 已提交
1679

1680 1681
		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
M
Matias Bjørling 已提交
1682 1683 1684
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
1685
		if (!blk_get_queue(dev->ctrl.admin_q)) {
1686
			nvme_dev_remove_admin(dev);
1687
			dev->ctrl.admin_q = NULL;
1688 1689
			return -ENODEV;
		}
K
Keith Busch 已提交
1690
	} else
1691
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
M
Matias Bjørling 已提交
1692 1693 1694 1695

	return 0;
}

1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

1722
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
1723
{
1724
	int result;
M
Matthew Wilcox 已提交
1725 1726 1727
	u32 aqa;
	struct nvme_queue *nvmeq;

1728 1729 1730 1731
	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

1732
	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl);
	if (result < 0)
		return result;

	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
	if (result)
		return result;

	dev->ctrl.numa_node = dev_to_node(dev->dev);

	nvmeq = &dev->queues[0];
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl);
	if (result)
		return result;

	nvmeq->cq_vector = 0;
	nvme_init_queue(nvmeq, 0);
	result = queue_request_irq(nvmeq);
	if (result) {
		dev->online_queues--;
		return result;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;
}

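/*
 * Allocate and create all I/O queues up to dev->max_qid; queues beyond
 * the read/write set are created as polled queues without an interrupt.
 */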
static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
	} else {
		rw_queues = max;
	}

	for (i = dev->online_queues; i <= max; i++) {
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}

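/*
 * Map the Controller Memory Buffer, if present. CMBSZ reports the size
 * in units of 1 << (12 + 4 * SZU), so SZU = 0 means 4 KiB units and each
 * increment scales the unit by 16.
 */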
static void nvme_map_cmb(struct nvme_dev *dev)
{
	u64 size, offset;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int bar;

	if (dev->cmb_size)
		return;

	if (NVME_CAP_CMBS(dev->ctrl.cap))
		writel(NVME_CMBMSC_CRE, dev->bar + NVME_REG_CMBMSC);

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!dev->cmbsz)
		return;
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);

	if (offset > bar_size)
		return;

	/*
	 * Tell the controller about the host side address mapping the CMB,
	 * and enable CMB decoding for the NVMe 1.4+ scheme:
	 */
	if (NVME_CAP_CMBS(dev->ctrl.cap)) {
		hi_lo_writeq(NVME_CMBMSC_CRE | NVME_CMBMSC_CMSE |
			     (pci_bus_address(pdev, bar) + offset),
			     dev->bar + NVME_REG_CMBMSC);
	}

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
		return;
	}

	dev->cmb_size = size;
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);
}

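/*
 * Enable, return or disable the Host Memory Buffer via Set Features
 * (NVME_FEAT_HOST_MEM_BUF): dword11 carries the control bits, dword12 the
 * buffer size in controller pages, dword13/14 the descriptor list address
 * and dword15 the descriptor count.
 */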
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
	u64 dma_addr = dev->host_mem_descs_dma;
	struct nvme_command c = { };
	int ret;

	c.features.opcode	= nvme_admin_set_features;
	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	c.features.dword11	= cpu_to_le32(bits);
	c.features.dword12	= cpu_to_le32(host_mem_size);
	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
	if (ret) {
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	} else
		dev->hmb = bits & NVME_HOST_MEM_ENABLE;

	return ret;
}

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;

		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
			       le64_to_cpu(desc->addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
	dev->host_mem_descs = NULL;
	dev->nr_host_mem_descs = 0;
}

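/*
 * Build the host memory buffer from up to max_entries DMA chunks of
 * chunk_size bytes, filling one descriptor per chunk; a partial
 * allocation is kept and the caller decides whether it meets the minimum.
 */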
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	struct nvme_host_mem_buf_desc *descs;
	u32 max_entries, len;
	dma_addr_t descs_dma;
	int i = 0;
	void **bufs;
	u64 size, tmp;

	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
				   &descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	for (size = 0; size < preferred && i < max_entries; size += len) {
		dma_addr_t dma_addr;

		len = min_t(u64, chunk_size, preferred - size);
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
		i++;
	}

	if (!size)
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--i >= 0) {
		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;

		dma_free_attrs(dev->dev, size, bufs[i],
			       le64_to_cpu(descs[i].addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	u64 min_chunk = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	u64 hmminds = max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	u64 chunk_size;

	/* start big and work our way down */
	for (chunk_size = min_chunk; chunk_size >= hmminds; chunk_size /= 2) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}

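/*
 * HMPRE and HMMIN are reported in 4 KiB units; as an example (assumed
 * value), hmpre = 262144 asks for a 1 GiB buffer, which is then clamped
 * by the max_host_mem_size_mb module parameter.
 */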
static int nvme_setup_host_mem(struct nvme_dev *dev)
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
	int ret;

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
		return 0;
	}

	/*
	 * If we already have a buffer allocated check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
			return 0; /* controller must work without HMB */
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
	}

	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
		nvme_free_host_mem(dev);
	return ret;
}

static ssize_t cmb_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmb);

static ssize_t cmbloc_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "%u\n", ndev->cmbloc);
}
static DEVICE_ATTR_RO(cmbloc);

static ssize_t cmbsz_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "%u\n", ndev->cmbsz);
}
static DEVICE_ATTR_RO(cmbsz);

static ssize_t hmb_show(struct device *dev, struct device_attribute *attr,
			char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return sysfs_emit(buf, "%d\n", ndev->hmb);
}

static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
			 const char *buf, size_t count)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));
	bool new;
	int ret;

	if (strtobool(buf, &new) < 0)
		return -EINVAL;

	if (new == ndev->hmb)
		return count;

	if (new) {
		ret = nvme_setup_host_mem(ndev);
	} else {
		ret = nvme_set_host_mem(ndev, 0);
		if (!ret)
			nvme_free_host_mem(ndev);
	}

	if (ret < 0)
		return ret;

	return count;
}
static DEVICE_ATTR_RW(hmb);

static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj,
		struct attribute *a, int n)
{
	struct nvme_ctrl *ctrl =
		dev_get_drvdata(container_of(kobj, struct device, kobj));
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	if (a == &dev_attr_cmb.attr ||
	    a == &dev_attr_cmbloc.attr ||
	    a == &dev_attr_cmbsz.attr) {
		if (!dev->cmbsz)
			return 0;
	}
	if (a == &dev_attr_hmb.attr && !ctrl->hmpre)
		return 0;

	return a->mode;
}

static struct attribute *nvme_pci_attrs[] = {
	&dev_attr_cmb.attr,
	&dev_attr_cmbloc.attr,
	&dev_attr_cmbsz.attr,
	&dev_attr_hmb.attr,
	NULL,
};

static const struct attribute_group nvme_pci_attr_group = {
	.attrs		= nvme_pci_attrs,
	.is_visible	= nvme_pci_attrs_are_visible,
};

/*
 * nirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
 */
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
{
	struct nvme_dev *dev = affd->priv;
	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;

	/*
	 * If there is no interrupt available for queues, ensure that
	 * the default queue is set to 1. The affinity set size is
	 * also set to one, but the irq core ignores it for this case.
	 *
	 * If only one interrupt is available or 'write_queue' == 0, combine
	 * write and read queues.
	 *
	 * If 'write_queues' > 0, ensure it leaves room for at least one read
	 * queue.
	 */
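	/*
	 * Worked example (assumed values): with nrirqs = 8 and
	 * nr_write_queues = 2, the split below yields 2 default (write)
	 * vectors and 6 read vectors; poll queues never take a vector.
	 */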
	if (!nrirqs) {
		nrirqs = 1;
		nr_read_queues = 0;
	} else if (nrirqs == 1 || !nr_write_queues) {
		nr_read_queues = 0;
	} else if (nr_write_queues >= nrirqs) {
		nr_read_queues = 1;
	} else {
		nr_read_queues = nrirqs - nr_write_queues;
	}

	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
	affd->nr_sets = nr_read_queues ? 2 : 1;
}

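/*
 * Allocate MSI-X/MSI vectors: one pre-reserved vector for the admin queue
 * plus one per non-polled I/O queue, with nvme_calc_irq_sets() splitting
 * the rest into default (write) and read affinity sets.
 */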
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	struct irq_affinity affd = {
		.pre_vectors	= 1,
		.calc_sets	= nvme_calc_irq_sets,
		.priv		= dev,
	};
	unsigned int irq_queues, poll_queues;

	/*
	 * Poll queues don't need interrupts, but we need at least one I/O queue
	 * left over for non-polled I/O.
	 */
	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;

	/*
	 * Initialize for the single interrupt case, will be updated in
	 * nvme_calc_irq_sets().
	 */
	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
	dev->io_queues[HCTX_TYPE_READ] = 0;

	/*
	 * We need interrupts for the admin queue and each non-polled I/O queue,
	 * but some Apple controllers require all queues to use the first
	 * vector.
	 */
	irq_queues = 1;
	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
		irq_queues += (nr_io_queues - poll_queues);
	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}

static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
	/*
	 * If tags are shared with admin queue (Apple bug), then
	 * make sure we only use one IO queue.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
		return 1;
	return num_possible_cpus() + dev->nr_write_queues + dev->nr_poll_queues;
}

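/*
 * Size the doorbell mapping, re-allocate interrupt vectors for the number
 * of I/O queues the controller grants us and create the queues, retrying
 * with fewer queues if not all of them come up.
 */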
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = &dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	unsigned int nr_io_queues;
	unsigned long size;
	int result;

	/*
	 * Sample the module parameters once at reset time so that we have
	 * stable values to work with.
	 */
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;

	nr_io_queues = dev->nr_allocated_queues - 1;
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	/*
	 * Free IRQ resources as soon as NVMEQ_ENABLED bit transitions
	 * from set to unset. If there is a window before it is truly freed,
	 * pci_free_irq_vectors() jumping into this window will crash.
	 * And take lock to avoid racing with pci_free_irq_vectors() in
	 * nvme_dev_disable() path.
	 */
	result = nvme_setup_io_queues_trylock(dev);
	if (result)
		return result;
	if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
		pci_free_irq(pdev, 0, adminq);

	if (dev->cmb_use_sqes) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			dev->cmb_use_sqes = false;
	}

	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues) {
			result = -ENOMEM;
			goto out_unlock;
		}
	} while (1);
	adminq->q_db = dev->dbs;

 retry:
	/* Deregister the admin queue's interrupt */
	if (test_and_clear_bit(NVMEQ_ENABLED, &adminq->flags))
		pci_free_irq(pdev, 0, adminq);

	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
	pci_free_irq_vectors(pdev);

	result = nvme_setup_irqs(dev, nr_io_queues);
	if (result <= 0) {
		result = -EIO;
		goto out_unlock;
	}

	dev->num_vecs = result;
	result = max(result - 1, 1);
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	result = queue_request_irq(adminq);
	if (result)
		goto out_unlock;
	set_bit(NVMEQ_ENABLED, &adminq->flags);
	mutex_unlock(&dev->shutdown_lock);

	result = nvme_create_io_queues(dev);
	if (result || dev->online_queues < 2)
		return result;

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_disable_io_queues(dev);
		result = nvme_setup_io_queues_trylock(dev);
		if (result)
			return result;
		nvme_suspend_io_queues(dev);
		goto retry;
	}
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
	return 0;
out_unlock:
	mutex_unlock(&dev->shutdown_lock);
	return result;
}

2359
static void nvme_del_queue_end(struct request *req, blk_status_t error)
K
Keith Busch 已提交
2360
{
K
Keith Busch 已提交
2361
	struct nvme_queue *nvmeq = req->end_io_data;
2362

K
Keith Busch 已提交
2363
	blk_mq_free_request(req);
2364
	complete(&nvmeq->delete_done);
K
Keith Busch 已提交
2365 2366
}

2367
static void nvme_del_cq_end(struct request *req, blk_status_t error)
K
Keith Busch 已提交
2368
{
K
Keith Busch 已提交
2369
	struct nvme_queue *nvmeq = req->end_io_data;
K
Keith Busch 已提交
2370

2371 2372
	if (error)
		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
K
Keith Busch 已提交
2373 2374

	nvme_del_queue_end(req, error);
K
Keith Busch 已提交
2375 2376
}

K
Keith Busch 已提交
2377
static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
2378
{
K
Keith Busch 已提交
2379 2380
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
2381
	struct nvme_command cmd = { };
2382

K
Keith Busch 已提交
2383 2384
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);
2385

2386
	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT);
K
Keith Busch 已提交
2387 2388
	if (IS_ERR(req))
		return PTR_ERR(req);
2389

K
Keith Busch 已提交
2390 2391
	req->end_io_data = nvmeq;

2392
	init_completion(&nvmeq->delete_done);
2393
	blk_execute_rq_nowait(NULL, req, false,
K
Keith Busch 已提交
2394 2395 2396
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
2397 2398
}

2399
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
K
Keith Busch 已提交
2400
{
2401
	int nr_queues = dev->online_queues - 1, sent = 0;
K
Keith Busch 已提交
2402
	unsigned long timeout;
K
Keith Busch 已提交
2403

K
Keith Busch 已提交
2404
 retry:
2405
	timeout = NVME_ADMIN_TIMEOUT;
2406 2407 2408 2409 2410
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
K
Keith Busch 已提交
2411
	}
2412 2413 2414 2415
	while (sent) {
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
2416 2417 2418
				timeout);
		if (timeout == 0)
			return false;
2419 2420

		sent--;
2421 2422 2423 2424
		if (nr_queues)
			goto retry;
	}
	return true;
K
Keith Busch 已提交
2425 2426
}

K
Keith Busch 已提交
2427
static void nvme_dev_add(struct nvme_dev *dev)
M
Matthew Wilcox 已提交
2428
{
2429 2430
	int ret;

2431
	if (!dev->ctrl.tagset) {
2432
		dev->tagset.ops = &nvme_mq_ops;
2433
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
2434
		dev->tagset.nr_maps = 2; /* default + read */
2435 2436
		if (dev->io_queues[HCTX_TYPE_POLL])
			dev->tagset.nr_maps++;
2437
		dev->tagset.timeout = NVME_IO_TIMEOUT;
2438
		dev->tagset.numa_node = dev->ctrl.numa_node;
2439 2440
		dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
						BLK_MQ_MAX_DEPTH) - 1;
2441
		dev->tagset.cmd_size = sizeof(struct nvme_iod);
2442 2443
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;
M
Matthew Wilcox 已提交
2444

2445 2446 2447 2448 2449 2450 2451 2452
		/*
		 * Some Apple controllers requires tags to be unique
		 * across admin and IO queue, so reserve the first 32
		 * tags of the IO queue.
		 */
		if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
			dev->tagset.reserved_tags = NVME_AQ_DEPTH;

2453 2454 2455 2456
		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
K
Keith Busch 已提交
2457
			return;
2458
		}
2459
		dev->ctrl.tagset = &dev->tagset;
2460 2461 2462 2463 2464
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
2465
	}
2466

2467
	nvme_dbbuf_set(dev);
M
Matthew Wilcox 已提交
2468 2469
}

2470
static int nvme_pci_enable(struct nvme_dev *dev)
2471
{
2472
	int result = -ENOMEM;
2473
	struct pci_dev *pdev = to_pci_dev(dev->dev);
2474
	int dma_address_bits = 64;
2475 2476 2477 2478 2479 2480

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

2481 2482 2483
	if (dev->ctrl.quirks & NVME_QUIRK_DMA_ADDRESS_BITS_48)
		dma_address_bits = 48;
	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(dma_address_bits)))
2484
		goto disable;
2485

2486
	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
K
Keith Busch 已提交
2487
		result = -ENODEV;
2488
		goto disable;
K
Keith Busch 已提交
2489
	}
2490 2491

	/*
2492 2493 2494
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
2495
	 */
2496 2497 2498
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;
2499

2500
	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
2501

2502
	dev->q_depth = min_t(u32, NVME_CAP_MQES(dev->ctrl.cap) + 1,
2503
				io_queue_depth);
2504
	dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */
2505
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
2506
	dev->dbs = dev->bar + 4096;
2507

2508 2509 2510 2511 2512 2513 2514 2515 2516
	/*
	 * Some Apple controllers require a non-standard SQE size.
	 * Interestingly they also seem to ignore the CC:IOSQES register
	 * so we don't bother updating it here.
	 */
	if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES)
		dev->io_sqes = 7;
	else
		dev->io_sqes = NVME_NVM_IOSQES;
2517 2518 2519 2520 2521 2522 2523

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
2524 2525
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
2526
			dev->q_depth);
2527 2528
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
2529
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
2530 2531 2532
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
                        "set queue depth=%u\n", dev->q_depth);
2533 2534
	}

2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546
	/*
	 * Controllers with the shared tags quirk need the IO queue to be
	 * big enough so that we get 32 tags for the admin queue
	 */
	if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) &&
	    (dev->q_depth < (NVME_AQ_DEPTH + 2))) {
		dev->q_depth = NVME_AQ_DEPTH + 2;
		dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n",
			 dev->q_depth);
	}


2547
	nvme_map_cmb(dev);
2548

K
Keith Busch 已提交
2549 2550
	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
2551 2552 2553 2554 2555 2556 2557 2558
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
2559 2560 2561
{
	if (dev->bar)
		iounmap(dev->bar);
2562
	pci_release_mem_regions(to_pci_dev(dev->dev));
2563 2564 2565
}

static void nvme_pci_disable(struct nvme_dev *dev)
2566
{
2567 2568
	struct pci_dev *pdev = to_pci_dev(dev->dev);

2569
	pci_free_irq_vectors(pdev);
2570

K
Keith Busch 已提交
2571 2572
	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
2573
		pci_disable_device(pdev);
K
Keith Busch 已提交
2574 2575 2576
	}
}

2577
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
M
Matthew Wilcox 已提交
2578
{
2579
	bool dead = true, freeze = false;
K
Keith Busch 已提交
2580
	struct pci_dev *pdev = to_pci_dev(dev->dev);
2581

2582
	mutex_lock(&dev->shutdown_lock);
K
Keith Busch 已提交
2583 2584 2585
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

K
Keith Busch 已提交
2586
		if (dev->ctrl.state == NVME_CTRL_LIVE ||
2587 2588
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			freeze = true;
K
Keith Busch 已提交
2589
			nvme_start_freeze(&dev->ctrl);
2590
		}
K
Keith Busch 已提交
2591 2592
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state  != pci_channel_io_normal);
2593
	}
2594

K
Keith Busch 已提交
2595 2596 2597 2598
	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
2599 2600
	if (!dead && shutdown && freeze)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
2601 2602

	nvme_stop_queues(&dev->ctrl);
2603

2604
	if (!dead && dev->ctrl.queue_count > 0) {
2605
		nvme_disable_io_queues(dev);
2606
		nvme_disable_admin_queue(dev, shutdown);
K
Keith Busch 已提交
2607
	}
2608 2609
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
2610
	nvme_pci_disable(dev);
2611
	nvme_reap_pending_cqes(dev);
2612

2613 2614
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
2615 2616
	blk_mq_tagset_wait_completed_request(&dev->tagset);
	blk_mq_tagset_wait_completed_request(&dev->admin_tagset);
K
Keith Busch 已提交
2617 2618 2619 2620 2621 2622

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
2623
	if (shutdown) {
K
Keith Busch 已提交
2624
		nvme_start_queues(&dev->ctrl);
2625 2626 2627
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
	}
2628
	mutex_unlock(&dev->shutdown_lock);
M
Matthew Wilcox 已提交
2629 2630
}

2631 2632 2633 2634 2635 2636 2637 2638
static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (!nvme_wait_reset(&dev->ctrl))
		return -EBUSY;
	nvme_dev_disable(dev, shutdown);
	return 0;
}

M
Matthew Wilcox 已提交
2639 2640
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
2641
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
C
Christoph Hellwig 已提交
2642 2643
						NVME_CTRL_PAGE_SIZE,
						NVME_CTRL_PAGE_SIZE, 0);
M
Matthew Wilcox 已提交
2644 2645 2646
	if (!dev->prp_page_pool)
		return -ENOMEM;

2647
	/* Optimisation for I/Os between 4k and 128k */
2648
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
2649 2650 2651 2652 2653
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
M
Matthew Wilcox 已提交
2654 2655 2656 2657 2658 2659
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
2660
	dma_pool_destroy(dev->prp_small_pool);
M
Matthew Wilcox 已提交
2661 2662
}

2663 2664 2665 2666 2667 2668 2669
static void nvme_free_tagset(struct nvme_dev *dev)
{
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	dev->ctrl.tagset = NULL;
}

2670
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
2671
{
2672
	struct nvme_dev *dev = to_nvme_dev(ctrl);
2673

2674
	nvme_dbbuf_dma_free(dev);
2675
	nvme_free_tagset(dev);
2676 2677
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
2678
	free_opal_dev(dev->ctrl.opal_dev);
2679
	mempool_destroy(dev->iod_mempool);
2680 2681
	put_device(dev->dev);
	kfree(dev->queues);
2682 2683 2684
	kfree(dev);
}

2685
static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
2686
{
2687 2688 2689 2690 2691
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
2692
	nvme_get_ctrl(&dev->ctrl);
2693
	nvme_dev_disable(dev, false);
2694
	nvme_kill_queues(&dev->ctrl);
2695
	if (!queue_work(nvme_wq, &dev->remove_work))
2696 2697 2698
		nvme_put_ctrl(&dev->ctrl);
}

2699
static void nvme_reset_work(struct work_struct *work)
2700
{
2701 2702
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
2703
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
2704
	int result;
2705

2706 2707 2708
	if (dev->ctrl.state != NVME_CTRL_RESETTING) {
		dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
			 dev->ctrl.state);
2709
		result = -ENODEV;
2710
		goto out;
2711
	}
2712

2713 2714 2715 2716
	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
2717
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
2718
		nvme_dev_disable(dev, false);
K
Keith Busch 已提交
2719
	nvme_sync_queues(&dev->ctrl);
2720

2721
	mutex_lock(&dev->shutdown_lock);
2722
	result = nvme_pci_enable(dev);
2723
	if (result)
2724
		goto out_unlock;
2725

2726
	result = nvme_pci_configure_admin_queue(dev);
2727
	if (result)
2728
		goto out_unlock;
2729

K
Keith Busch 已提交
2730 2731
	result = nvme_alloc_admin_tags(dev);
	if (result)
2732
		goto out_unlock;
2733

2734 2735 2736 2737
	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
2738 2739
	dev->ctrl.max_hw_sectors = min_t(u32,
		NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
2740
	dev->ctrl.max_segments = NVME_MAX_SEGS;
2741 2742 2743 2744 2745

	/*
	 * Don't limit the IOMMU merged segment size.
	 */
	dma_set_max_seg_size(dev->dev, 0xffffffff);
J
Jianxiong Gao 已提交
2746
	dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
2747

2748 2749 2750 2751 2752 2753 2754 2755 2756
	mutex_unlock(&dev->shutdown_lock);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
2757
		result = -EBUSY;
2758 2759
		goto out;
	}
2760

2761 2762 2763 2764 2765 2766
	/*
	 * We do not support an SGL for metadata (yet), so we are limited to a
	 * single integrity segment for the separate metadata pointer.
	 */
	dev->ctrl.max_integrity_segments = 1;

2767
	result = nvme_init_ctrl_finish(&dev->ctrl);
2768
	if (result)
2769
		goto out;
2770

2771 2772 2773 2774 2775 2776 2777 2778 2779
	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
2780
	}
2781

2782 2783 2784 2785 2786 2787 2788
	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

2789 2790 2791 2792 2793
	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}
2794

2795
	result = nvme_setup_io_queues(dev);
2796
	if (result)
2797
		goto out;
2798

2799 2800 2801 2802
	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
2803
	if (dev->online_queues < 2) {
2804
		dev_warn(dev->ctrl.device, "IO queues not created\n");
2805
		nvme_kill_queues(&dev->ctrl);
2806
		nvme_remove_namespaces(&dev->ctrl);
2807
		nvme_free_tagset(dev);
2808
	} else {
2809
		nvme_start_queues(&dev->ctrl);
K
Keith Busch 已提交
2810
		nvme_wait_freeze(&dev->ctrl);
K
Keith Busch 已提交
2811
		nvme_dev_add(dev);
K
Keith Busch 已提交
2812
		nvme_unfreeze(&dev->ctrl);
2813 2814
	}

2815 2816 2817 2818
	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
K
Keith Busch 已提交
2819
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
2820
		dev_warn(dev->ctrl.device,
K
Keith Busch 已提交
2821
			"failed to mark controller live state\n");
2822
		result = -ENODEV;
2823 2824
		goto out;
	}
2825

2826 2827 2828 2829
	if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
			&nvme_pci_attr_group))
		dev->attrs_added = true;

2830
	nvme_start_ctrl(&dev->ctrl);
2831
	return;
2832

2833 2834
 out_unlock:
	mutex_unlock(&dev->shutdown_lock);
2835
 out:
2836 2837 2838 2839
	if (result)
		dev_warn(dev->ctrl.device,
			 "Removing after probe failure status: %d\n", result);
	nvme_remove_dead_ctrl(dev);
2840 2841
}

2842
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
K
Keith Busch 已提交
2843
{
2844
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
2845
	struct pci_dev *pdev = to_pci_dev(dev->dev);
K
Keith Busch 已提交
2846 2847

	if (pci_get_drvdata(pdev))
K
Keith Busch 已提交
2848
		device_release_driver(&pdev->dev);
2849
	nvme_put_ctrl(&dev->ctrl);
K
Keith Busch 已提交
2850 2851
}

2852
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
T
Tejun Heo 已提交
2853
{
2854
	*val = readl(to_nvme_dev(ctrl)->bar + off);
2855
	return 0;
T
Tejun Heo 已提交
2856 2857
}

2858
static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
2859
{
2860 2861 2862
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}
2863

2864 2865
static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
2866
	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
2867
	return 0;
2868 2869
}

2870 2871 2872 2873
static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

2874
	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
2875 2876
}

2877
static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
M
Ming Lin 已提交
2878
	.name			= "pcie",
2879
	.module			= THIS_MODULE,
2880 2881
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
2882
	.reg_read32		= nvme_pci_reg_read32,
2883
	.reg_write32		= nvme_pci_reg_write32,
2884
	.reg_read64		= nvme_pci_reg_read64,
2885
	.free_ctrl		= nvme_pci_free_ctrl,
2886
	.submit_async_event	= nvme_pci_submit_async_event,
2887
	.get_address		= nvme_pci_get_address,
2888
};
2889

2890 2891 2892 2893
static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

2894
	if (pci_request_mem_regions(pdev, "nvme"))
2895 2896
		return -ENODEV;

2897
	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
2898 2899
		goto release;

M
Max Gurtovoy 已提交
2900
	return 0;
2901
  release:
M
Max Gurtovoy 已提交
2902 2903
	pci_release_mem_regions(pdev);
	return -ENODEV;
2904 2905
}

2906
static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and uses the deepest sleep state.
		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
		 * 950 PRO 256GB", but it seems to be restricted to two Dell
		 * laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
2921 2922 2923
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
2924 2925 2926
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A
2927 2928
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
2929 2930
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
2931
			return NVME_QUIRK_NO_APST;
2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943
	} else if ((pdev->vendor == 0x144d && (pdev->device == 0xa801 ||
		    pdev->device == 0xa808 || pdev->device == 0xa809)) ||
		   (pdev->vendor == 0x1e0f && pdev->device == 0x0001)) {
		/*
		 * Forcing to use host managed nvme power settings for
		 * lowest idle power with quick resume latency on
		 * Samsung and Toshiba SSDs based on suspend behavior
		 * on Coffee Lake board for LENOVO C640
		 */
		if ((dmi_match(DMI_BOARD_VENDOR, "LENOVO")) &&
		     dmi_match(DMI_BOARD_NAME, "LNVNB161216"))
			return NVME_QUIRK_SIMPLE_SUSPEND;
2944 2945 2946 2947 2948
	}

	return 0;
}

2949 2950 2951
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;
2952

2953
	flush_work(&dev->ctrl.reset_work);
2954
	flush_work(&dev->ctrl.scan_work);
2955
	nvme_put_ctrl(&dev->ctrl);
2956 2957
}

2958
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
M
Matthew Wilcox 已提交
2959
{
M
Matias Bjørling 已提交
2960
	int node, result = -ENOMEM;
M
Matthew Wilcox 已提交
2961
	struct nvme_dev *dev;
2962
	unsigned long quirks = id->driver_data;
2963
	size_t alloc_size;
M
Matthew Wilcox 已提交
2964

M
Matias Bjørling 已提交
2965 2966
	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
2967
		set_dev_node(&pdev->dev, first_memory_node);
M
Matias Bjørling 已提交
2968 2969

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
M
Matthew Wilcox 已提交
2970 2971
	if (!dev)
		return -ENOMEM;
2972

2973 2974 2975 2976 2977
	dev->nr_write_queues = write_queues;
	dev->nr_poll_queues = poll_queues;
	dev->nr_allocated_queues = nvme_max_io_queues(dev) + 1;
	dev->queues = kcalloc_node(dev->nr_allocated_queues,
			sizeof(struct nvme_queue), GFP_KERNEL, node);
M
Matthew Wilcox 已提交
2978 2979 2980
	if (!dev->queues)
		goto free;

2981
	dev->dev = get_device(&pdev->dev);
K
Keith Busch 已提交
2982
	pci_set_drvdata(pdev, dev);
2983

2984 2985
	result = nvme_dev_map(dev);
	if (result)
2986
		goto put_pci;
2987

2988
	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
2989
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
2990
	mutex_init(&dev->shutdown_lock);
M
Matthew Wilcox 已提交
2991

M
Matthew Wilcox 已提交
2992 2993
	result = nvme_setup_prp_pools(dev);
	if (result)
2994
		goto unmap;
2995

2996
	quirks |= check_vendor_combination_bug(pdev);
2997

2998
	if (!noacpi && acpi_storage_d3(&pdev->dev)) {
2999 3000 3001 3002 3003 3004 3005 3006 3007
		/*
		 * Some systems use a bios work around to ask for D3 on
		 * platforms that support kernel managed suspend.
		 */
		dev_info(&pdev->dev,
			 "platform quirk: setting simple suspend\n");
		quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
	}

3008 3009 3010 3011
	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
3012
	alloc_size = nvme_pci_iod_alloc_size();
3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

3024 3025 3026 3027 3028
	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

3029 3030
	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

3031
	nvme_reset_ctrl(&dev->ctrl);
3032
	async_schedule(nvme_async_probe, dev);
3033

M
Matthew Wilcox 已提交
3034 3035
	return 0;

3036 3037
 release_mempool:
	mempool_destroy(dev->iod_mempool);
3038
 release_pools:
M
Matthew Wilcox 已提交
3039
	nvme_release_prp_pools(dev);
3040 3041
 unmap:
	nvme_dev_unmap(dev);
K
Keith Busch 已提交
3042
 put_pci:
3043
	put_device(dev->dev);
M
Matthew Wilcox 已提交
3044 3045 3046 3047 3048 3049
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

3050
static void nvme_reset_prepare(struct pci_dev *pdev)
3051
{
K
Keith Busch 已提交
3052
	struct nvme_dev *dev = pci_get_drvdata(pdev);
3053 3054 3055 3056 3057 3058 3059 3060

	/*
	 * We don't need to check the return value from waiting for the reset
	 * state as pci_dev device lock is held, making it impossible to race
	 * with ->remove().
	 */
	nvme_disable_prepare_reset(dev, false);
	nvme_sync_queues(&dev->ctrl);
3061
}
3062

3063 3064
static void nvme_reset_done(struct pci_dev *pdev)
{
3065
	struct nvme_dev *dev = pci_get_drvdata(pdev);
3066 3067 3068

	if (!nvme_try_sched_reset(&dev->ctrl))
		flush_work(&dev->ctrl.reset_work);
3069 3070
}

3071 3072 3073
static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
3074

3075
	nvme_disable_prepare_reset(dev, true);
3076 3077
}

3078 3079 3080 3081 3082 3083 3084
static void nvme_remove_attrs(struct nvme_dev *dev)
{
	if (dev->attrs_added)
		sysfs_remove_group(&dev->ctrl.device->kobj,
				   &nvme_pci_attr_group);
}

3085 3086 3087 3088 3089
/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
3090
static void nvme_remove(struct pci_dev *pdev)
M
Matthew Wilcox 已提交
3091 3092
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
K
Keith Busch 已提交
3093

3094
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
K
Keith Busch 已提交
3095
	pci_set_drvdata(pdev, NULL);
3096

3097
	if (!pci_device_is_present(pdev)) {
3098
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
3099
		nvme_dev_disable(dev, true);
3100
	}
3101

3102
	flush_work(&dev->ctrl.reset_work);
3103 3104
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
3105
	nvme_dev_disable(dev, true);
3106
	nvme_remove_attrs(dev);
3107
	nvme_free_host_mem(dev);
M
Matias Bjørling 已提交
3108
	nvme_dev_remove_admin(dev);
3109
	nvme_free_queues(dev, 0);
K
Keith Busch 已提交
3110
	nvme_release_prp_pools(dev);
3111
	nvme_dev_unmap(dev);
3112
	nvme_uninit_ctrl(&dev->ctrl);
M
Matthew Wilcox 已提交
3113 3114
}

3115
#ifdef CONFIG_PM_SLEEP
3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130
static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
}

static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
{
	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
}

static int nvme_resume(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
	struct nvme_ctrl *ctrl = &ndev->ctrl;

3131
	if (ndev->last_ps == U32_MAX ||
3132
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
3133 3134 3135 3136
		goto reset;
	if (ctrl->hmpre && nvme_setup_host_mem(ndev))
		goto reset;

3137
	return 0;
3138 3139
reset:
	return nvme_try_sched_reset(ctrl);
3140 3141
}

3142 3143 3144 3145
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);
3146 3147 3148
	struct nvme_ctrl *ctrl = &ndev->ctrl;
	int ret = -EBUSY;

3149 3150
	ndev->last_ps = U32_MAX;

3151 3152 3153 3154 3155 3156 3157
	/*
	 * The platform does not remove power for a kernel managed suspend so
	 * use host managed nvme power settings for lowest idle power if
	 * possible. This should have quicker resume latency than a full device
	 * shutdown.  But if the firmware is involved after the suspend or the
	 * device does not support any non-default power states, shut down the
	 * device fully.
3158 3159 3160 3161 3162
	 *
	 * If ASPM is not enabled for the device, shut down the device and allow
	 * the PCI bus layer to put it into D3 in order to take the PCIe link
	 * down, so as to allow the platform to achieve its minimum low-power
	 * state (which may not be possible if the link is up).
3163
	 */
3164
	if (pm_suspend_via_firmware() || !ctrl->npss ||
3165
	    !pcie_aspm_enabled(pdev) ||
3166 3167
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);
3168 3169 3170 3171 3172

	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

K
Keith Busch 已提交
3173
	if (ctrl->state != NVME_CTRL_LIVE)
3174 3175
		goto unfreeze;

3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186
	/*
	 * Host memory access may not be successful in a system suspend state,
	 * but the specification allows the controller to access memory in a
	 * non-operational power state.
	 */
	if (ndev->hmb) {
		ret = nvme_set_host_mem(ndev, 0);
		if (ret < 0)
			goto unfreeze;
	}

3187 3188 3189 3190
	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
	if (ret < 0)
		goto unfreeze;

3191 3192 3193 3194 3195 3196 3197
	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);

3198 3199 3200 3201 3202
	ret = nvme_set_power_state(ctrl, ctrl->npss);
	if (ret < 0)
		goto unfreeze;

	if (ret) {
3203 3204 3205
		/* discard the saved state */
		pci_load_saved_state(pdev, NULL);

3206 3207
		/*
		 * Clearing npss forces a controller reset on resume. The
3208
		 * correct value will be rediscovered then.
3209
		 */
3210
		ret = nvme_disable_prepare_reset(ndev, true);
3211 3212 3213 3214 3215 3216 3217 3218 3219 3220
		ctrl->npss = 0;
	}
unfreeze:
	nvme_unfreeze(ctrl);
	return ret;
}

static int nvme_simple_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
3221

3222
	return nvme_disable_prepare_reset(ndev, true);
3223 3224
}

3225
static int nvme_simple_resume(struct device *dev)
3226 3227 3228 3229
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

3230
	return nvme_try_sched_reset(&ndev->ctrl);
3231 3232
}

3233
static const struct dev_pm_ops nvme_dev_pm_ops = {
3234 3235 3236 3237 3238 3239 3240 3241
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.freeze		= nvme_simple_suspend,
	.thaw		= nvme_simple_resume,
	.poweroff	= nvme_simple_suspend,
	.restore	= nvme_simple_resume,
};
#endif /* CONFIG_PM_SLEEP */
M
Matthew Wilcox 已提交
3242

K
Keith Busch 已提交
3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
K
Keith Busch 已提交
3257 3258
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
3259
		nvme_dev_disable(dev, false);
K
Keith Busch 已提交
3260 3261
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
K
Keith Busch 已提交
3262 3263
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
K
Keith Busch 已提交
3264 3265 3266 3267 3268 3269 3270 3271 3272
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

3273
	dev_info(dev->ctrl.device, "restart after slot reset\n");
K
Keith Busch 已提交
3274
	pci_restore_state(pdev);
3275
	nvme_reset_ctrl(&dev->ctrl);
K
Keith Busch 已提交
3276 3277 3278 3279 3280
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
K
Keith Busch 已提交
3281 3282 3283
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
K
Keith Busch 已提交
3284 3285
}

3286
static const struct pci_error_handlers nvme_err_handler = {
M
Matthew Wilcox 已提交
3287 3288 3289
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
3290 3291
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
M
Matthew Wilcox 已提交
3292 3293
};

3294
static const struct pci_device_id nvme_id_table[] = {
3295
	{ PCI_VDEVICE(INTEL, 0x0953),	/* Intel 750/P3500/P3600/P3700 */
3296
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3297
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3298
	{ PCI_VDEVICE(INTEL, 0x0a53),	/* Intel P3520 */
3299
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3300
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3301
	{ PCI_VDEVICE(INTEL, 0x0a54),	/* Intel P4500/P4600 */
3302
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
3303
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3304
	{ PCI_VDEVICE(INTEL, 0x0a55),	/* Dell Express Flash P4600 */
3305 3306
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3307
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
3308
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
3309
				NVME_QUIRK_MEDIUM_PRIO_SQ |
3310 3311
				NVME_QUIRK_NO_TEMP_THRESH_CHANGE |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3312 3313
	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3314
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
3315 3316
		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3317 3318
	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
3319
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
3320 3321
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
				NVME_QUIRK_NO_NS_DESC_LIST, },
3322 3323
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3324 3325
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3326 3327
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3328 3329 3330
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
3331
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY |
3332
				NVME_QUIRK_DISABLE_WRITE_ZEROES|
3333
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3334 3335
	{ PCI_DEVICE(0x1987, 0x5016),	/* Phison E16 */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3336 3337 3338
	{ PCI_DEVICE(0x1b4b, 0x1092),	/* Lexar 256 GB SSD */
		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3339 3340
	{ PCI_DEVICE(0x10ec, 0x5762),   /* ADATA SX6000LNP */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3341 3342 3343
	{ PCI_DEVICE(0x1cc1, 0x8201),   /* ADATA SX8200PNP 512GB */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3344 3345
	{ PCI_DEVICE(0x1c5c, 0x1504),   /* SK Hynix PC400 */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3346 3347
	{ PCI_DEVICE(0x15b7, 0x2001),   /*  Sandisk Skyhawk */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3348 3349
	{ PCI_DEVICE(0x1d97, 0x2263),   /* SPCC */
		.driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3350 3351
	{ PCI_DEVICE(0x2646, 0x2262),   /* KINGSTON SKC2000 NVMe SSD */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3352 3353
	{ PCI_DEVICE(0x2646, 0x2263),   /* KINGSTON A2000 NVMe SSD  */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS, },
3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0065),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x8061),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd00),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd01),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
	{ PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0xcd02),
		.driver_data = NVME_QUIRK_DMA_ADDRESS_BITS_48, },
3366 3367
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR },
3368
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
3369 3370
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
		.driver_data = NVME_QUIRK_SINGLE_VECTOR |
3371 3372
				NVME_QUIRK_128_BYTES_SQES |
				NVME_QUIRK_SHARED_TAGS },
3373 3374

	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
M
Matthew Wilcox 已提交
3375 3376 3377 3378 3379 3380 3381 3382
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
3383
	.remove		= nvme_remove,
3384
	.shutdown	= nvme_shutdown,
3385
#ifdef CONFIG_PM_SLEEP
3386 3387 3388
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
3389
#endif
3390
	.sriov_configure = pci_sriov_configure_simple,
M
Matthew Wilcox 已提交
3391 3392 3393 3394 3395
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
3396 3397 3398
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
3399
	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
3400

3401
	return pci_register_driver(&nvme_driver);
M
Matthew Wilcox 已提交
3402 3403 3404 3405 3406
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
3407
	flush_workqueue(nvme_wq);
M
Matthew Wilcox 已提交
3408 3409 3410 3411
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
3412
MODULE_VERSION("1.0");
M
Matthew Wilcox 已提交
3413 3414
module_init(nvme_init);
module_exit(nvme_exit);