// SPDX-License-Identifier: GPL-2.0
/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 */

#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>

#include "trace.h"
#include "nvme.h"

#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0444);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
	.get = param_get_int,
};

static int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should >= 2");

static int write_queues;
module_param(write_queues, int, 0644);
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

static int poll_queues;
module_param(poll_queues, int, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue *queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
	unsigned io_queues[HCTX_MAX_TYPES];
	unsigned int num_vecs;
	int q_depth;
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	u64 cmb_size;
	bool cmb_use_sqes;
	u32 cmbsz;
	u32 cmbloc;
	struct nvme_ctrl ctrl;
	u32 last_ps;

	mempool_t *iod_mempool;

	/* shadow doorbell buffer support: */
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int n = 0, ret;

	ret = kstrtoint(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_int(val, kp);
}

static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}
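/*
 * Each queue pair thus owns two consecutive stride-sized doorbell slots:
 * the SQ tail doorbell at qid * 2 * stride and the CQ head doorbell right
 * after it at (qid * 2 + 1) * stride.
 */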

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct nvme_dev *dev;
	spinlock_t sq_lock;
	struct nvme_command *sq_cmds;
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_tail;
	u16 last_sq_tail;
	u16 cq_head;
	u16 last_cq_head;
	u16 qid;
	u8 cq_phase;
	unsigned long flags;
#define NVMEQ_ENABLED		0
#define NVMEQ_SQ_CMB		1
#define NVMEQ_DELETE_ERROR	2
#define NVMEQ_POLLED		3
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
	struct completion delete_done;
};

/*
 * The nvme_iod describes the data in an I/O.
 *
 * The sg pointer contains the list of PRP/SGL chunk allocations in addition
 * to the actual struct scatterlist.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_queue *nvmeq;
	bool use_sgl;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	dma_addr_t first_dma;
	unsigned int dma_len;	/* length of single DMA segment mapping */
	dma_addr_t meta_dma;
	struct scatterlist *sg;
};

static unsigned int max_io_queues(void)
{
	return num_possible_cpus() + write_queues + poll_queues;
}

static unsigned int max_queue_count(void)
{
	/* IO queues + admin queue */
	return 1 + max_io_queues();
}

static inline unsigned int nvme_dbbuf_size(u32 stride)
{
	return (max_queue_count() * 8 * stride);
}
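/*
 * For the common doorbell stride of 1 this works out to 8 bytes per
 * possible queue: one u32 slot for the SQ doorbell and one for the CQ
 * doorbell.  The same size is used for the shadow doorbell buffer and
 * for the event-index buffer.
 */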

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
	}
}

static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}
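/*
 * Example: with old == 3, new_idx == 5 and event_idx == 4 the controller
 * asked to be notified once the host moves past index 4, so this returns
 * true and the caller must still ring the MMIO doorbell; the u16
 * arithmetic keeps the comparison correct across index wrap-around.
 */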

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory.  The controller needs to provide similar
		 * ordering to ensure the event index is updated before reading
		 * the doorbell.
		 */
		mb();

		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
}

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
static int nvme_pci_npages_sgl(unsigned int num_seg)
{
	return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
}
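/*
 * Example: a 4k page holds 4096 / 16 = 256 nvme_sgl_desc entries, so a
 * request with 300 segments needs two descriptor pages.
 */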

static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
		unsigned int size, unsigned int nseg, bool use_sgl)
{
	size_t alloc_size;

	if (use_sgl)
		alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
	else
		alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);

	return alloc_size + sizeof(struct scatterlist) * nseg;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;

	nvme_req(req)->ctrl = &dev->ctrl;
	return 0;
}

static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}

static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
			BUG_ON(i == HCTX_TYPE_DEFAULT);
			continue;
		}

		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
		map->queue_offset = qoff;
		if (i != HCTX_TYPE_POLL && offset)
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
}

/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}
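/*
 * Deferring the doorbell write until the last request of a batch (or
 * until the tail is about to wrap) lets a burst of queued commands be
 * submitted with a single MMIO write instead of one write per command.
 */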

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 * @write_sq: whether to write to the SQ doorbell
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
{
	spin_lock(&nvmeq->sq_lock);
	memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	nvme_write_sq_db(nvmeq, write_sq);
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
	spin_unlock(&nvmeq->sq_lock);
}

static void **nvme_pci_iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}

static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int nseg = blk_rq_nr_phys_segments(req);
	unsigned int avg_seg_size;

	if (nseg == 0)
		return false;

	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}
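/*
 * In short, SGLs are only used on I/O queues, only when the controller
 * advertises SGL support, and only when the average segment size reaches
 * sgl_threshold; PRPs remain cheaper for small, page-aligned transfers.
 */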

static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
	const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
	int i;

	if (iod->dma_len) {
		dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir);
		return;
	}

	WARN_ON_ONCE(!iod->nents);

	/* P2PDMA requests do not need to be unmapped */
	if (!is_pci_p2pdma_page(sg_page(iod->sg)))
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));


	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			dma_addr);

	for (i = 0; i < iod->npages; i++) {
		void *addr = nvme_pci_iod_list(req)[i];

		if (iod->use_sgl) {
			struct nvme_sgl_desc *sg_list = addr;

			next_dma_addr =
			    le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
		} else {
			__le64 *prp_list = addr;

			next_dma_addr = le64_to_cpu(prp_list[last_prp]);
		}

		dma_pool_free(dev->prp_page_pool, addr, dma_addr);
		dma_addr = next_dma_addr;
	}

	mempool_free(iod->sg, dev->iod_mempool);
}

static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}

static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	int length = blk_rq_payload_bytes(req);
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->ctrl.page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	void **list = nvme_pci_iod_list(req);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0) {
		iod->first_dma = 0;
		goto done;
	}

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		goto done;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return BLK_STS_RESOURCE;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		if (unlikely(dma_len < 0))
			goto bad_sgl;
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);

	return BLK_STS_OK;

 bad_sgl:
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
	return BLK_STS_IOERR;
}

static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmd, int entries)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
	int i = 0;

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

	if (entries == 1) {
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				return BLK_STS_RESOURCE;

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
	} while (--entries > 0);

	return BLK_STS_OK;
}

static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset;

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
	if (bv->bv_len > first_prp_len)
		cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
	return 0;
}
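/*
 * This simple path handles a single bio_vec that fits in at most two PRP
 * entries (up to two controller pages once bv_offset is accounted for),
 * so no PRP list page needs to be allocated.
 */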

static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
		return BLK_STS_RESOURCE;
	iod->dma_len = bv->bv_len;

	cmnd->flags = NVME_CMD_SGL_METABUF;
	cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
	cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
	return 0;
}

static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	blk_status_t ret = BLK_STS_RESOURCE;
	int nr_mapped;

	if (blk_rq_nr_phys_segments(req) == 1) {
		struct bio_vec bv = req_bvec(req);

		if (!is_pci_p2pdma_page(bv.bv_page)) {
			if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
				return nvme_setup_prp_simple(dev, req,
							     &cmnd->rw, &bv);

			if (iod->nvmeq->qid &&
			    dev->ctrl.sgls & ((1 << 0) | (1 << 1)))
				return nvme_setup_sgl_simple(dev, req,
							     &cmnd->rw, &bv);
		}
	}

	iod->dma_len = 0;
	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
	if (!iod->sg)
		return BLK_STS_RESOURCE;
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
	iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
	if (!iod->nents)
		goto out;

	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
					      rq_dma_dir(req));
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
					     rq_dma_dir(req), DMA_ATTR_NO_WARN);
	if (!nr_mapped)
		goto out;

	iod->use_sgl = nvme_pci_use_sgls(dev, req);
	if (iod->use_sgl)
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
out:
	if (ret != BLK_STS_OK)
		nvme_unmap_data(dev, req);
	return ret;
}

static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	iod->meta_dma = dma_map_bvec(dev->dev, rq_integrity_vec(req),
			rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->meta_dma))
		return BLK_STS_IOERR;
	cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
	return 0;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_command cmnd;
	blk_status_t ret;

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;

	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
		return BLK_STS_IOERR;

	ret = nvme_setup_cmd(ns, req, &cmnd);
	if (ret)
		return ret;

	if (blk_rq_nr_phys_segments(req)) {
		ret = nvme_map_data(dev, req, &cmnd);
		if (ret)
			goto out_free_cmd;
	}

	if (blk_integrity_rq(req)) {
		ret = nvme_map_metadata(dev, req, &cmnd);
		if (ret)
			goto out_unmap_data;
	}

	blk_mq_start_request(req);
	nvme_submit_cmd(nvmeq, &cmnd, bd->last);
	return BLK_STS_OK;
out_unmap_data:
	nvme_unmap_data(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}

static void nvme_pci_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dev *dev = iod->nvmeq->dev;

	nvme_cleanup_cmd(req);
	if (blk_integrity_rq(req))
		dma_unmap_page(dev->dev, iod->meta_dma,
			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
	if (blk_rq_nr_phys_segments(req))
		nvme_unmap_data(dev, req);
	nvme_complete_rq(req);
}

/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
	return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
			nvmeq->cq_phase;
}

static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvmeq->qid == 0 &&
			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
	nvme_end_request(req, cqe->status, cqe->result);
}

static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
	while (start != end) {
		nvme_handle_cqe(nvmeq, start);
		if (++start == nvmeq->q_depth)
			start = 0;
	}
}

static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
	if (nvmeq->cq_head == nvmeq->q_depth - 1) {
		nvmeq->cq_head = 0;
		nvmeq->cq_phase = !nvmeq->cq_phase;
	} else {
		nvmeq->cq_head++;
	}
}

static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
				  u16 *end, unsigned int tag)
{
	int found = 0;

	*start = nvmeq->cq_head;
	while (nvme_cqe_pending(nvmeq)) {
		if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
			found++;
		nvme_update_cq_head(nvmeq);
	}
	*end = nvmeq->cq_head;

	if (*start != *end)
		nvme_ring_cq_doorbell(nvmeq);
	return found;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	irqreturn_t ret = IRQ_NONE;
	u16 start, end;

	/*
	 * The rmb/wmb pair ensures we see all updates from a previous run of
	 * the irq handler, even if that was on another CPU.
	 */
	rmb();
	if (nvmeq->cq_head != nvmeq->last_cq_head)
		ret = IRQ_HANDLED;
	nvme_process_cq(nvmeq, &start, &end, -1);
	nvmeq->last_cq_head = nvmeq->cq_head;
	wmb();

	if (start != end) {
		nvme_complete_cqes(nvmeq, start, end);
		return IRQ_HANDLED;
	}

	return ret;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	if (nvme_cqe_pending(nvmeq))
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
}

/*
 * Poll for completions any queue, including those not dedicated to polling.
 * Can be called from any context.
 */
static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	u16 start, end;
	int found;

	/*
	 * For a poll queue we need to protect against the polling thread
	 * using the CQ lock.  For normal interrupt driven threads we have
	 * to disable the interrupt to avoid racing with it.
	 */
	if (test_bit(NVMEQ_POLLED, &nvmeq->flags)) {
		spin_lock(&nvmeq->cq_poll_lock);
		found = nvme_process_cq(nvmeq, &start, &end, tag);
		spin_unlock(&nvmeq->cq_poll_lock);
	} else {
		disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
		found = nvme_process_cq(nvmeq, &start, &end, tag);
		enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	}

	nvme_complete_cqes(nvmeq, start, end);
	return found;
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	u16 start, end;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

	spin_lock(&nvmeq->cq_poll_lock);
	found = nvme_process_cq(nvmeq, &start, &end, -1);
	spin_unlock(&nvmeq->cq_poll_lock);

	nvme_complete_cqes(nvmeq, start, end);
	return found;
}

static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	struct nvme_queue *nvmeq = &dev->queues[0];
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	nvme_submit_cmd(nvmeq, &c, true);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (!test_bit(NVMEQ_POLLED, &nvmeq->flags))
		flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void abort_endio(struct request *req, blk_status_t error)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;

	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
}

static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{

	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	default:
		break;
	}

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_command cmd;
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);
		return BLK_EH_DONE;
	}

	/*
	 * Did we miss an interrupt?
	 */
	if (nvme_poll_irqdisable(nvmeq, req->tag)) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
		return BLK_EH_DONE;
	}

	/*
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
		/* fall through */
	case NVME_CTRL_DELETING:
		dev_warn_ratelimited(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, true);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	case NVME_CTRL_RESETTING:
		return BLK_EH_RESET_TIMER;
	default:
		break;
	}

	/*
 	 * Shutdown the controller immediately and schedule a reset if the
 	 * command was already aborted once before and still hasn't been
 	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);

		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	}

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	iod->aborted = 1;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->dev->ctrl.device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
			BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->timeout = ADMIN_TIMEOUT;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (!nvmeq->sq_cmds)
		return;

	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
		pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev),
				nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
	} else {
		dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	}
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
		nvme_free_queue(&dev->queues[i]);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq: queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
		return 1;

	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
	mb();

	nvmeq->dev->online_queues--;
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
	if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
		pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
	return 0;
}

static void nvme_suspend_io_queues(struct nvme_dev *dev)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i > 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
}

static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *nvmeq = &dev->queues[0];

	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);

	nvme_poll_irqdisable(nvmeq, -1);
}

static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  dev->ctrl.page_size);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
		mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}
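/*
 * Example: a 4MB CMB shared by 16 I/O queues leaves 256k per queue; with
 * 64-byte submission queue entries that allows a depth of 4096, well
 * above the 64-entry floor enforced here.
 */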

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
		nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
						nvmeq->sq_cmds);
		if (nvmeq->sq_dma_addr) {
			set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
			return 0; 
		}
	}

	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
				&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
	return 0;
}

static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid)
		return 0;

	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->sq_lock);
	spin_lock_init(&nvmeq->cq_poll_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	dev->ctrl.queue_count++;

	return 0;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	return -ENOMEM;
}

static int queue_request_irq(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	nvmeq->sq_tail = 0;
	nvmeq->last_sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
	u16 vector = 0;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
		set_bit(NVMEQ_POLLED, &nvmeq->flags);

	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
	if (result)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		return result;
	else if (result)
		goto release_cq;

	nvmeq->cq_vector = vector;
	nvme_init_queue(nvmeq, qid);

	if (!polled) {
		nvmeq->cq_vector = vector;
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;

release_sq:
	dev->online_queues--;
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static const struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.init_hctx	= nvme_admin_init_hctx,
	.exit_hctx      = nvme_admin_exit_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
};

static const struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;

		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
		dev->admin_tagset.cmd_size = sizeof(struct nvme_iod);
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
		dev->ctrl.admin_tagset = &dev->admin_tagset;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);

	return 0;
}

static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
	if (result < 0)
		return result;

	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
	if (result)
		return result;

	nvmeq = &dev->queues[0];
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
	if (result)
		return result;

	nvmeq->cq_vector = 0;
	nvme_init_queue(nvmeq, 0);
	result = queue_request_irq(nvmeq);
	if (result) {
		dev->online_queues--;
		return result;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;
}

static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
J
Jens Axboe 已提交
1726 1727 1728 1729
	} else {
		rw_queues = max;
	}

1730
	for (i = dev->online_queues; i <= max; i++) {
J
Jens Axboe 已提交
1731 1732 1733
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
K
Keith Busch 已提交
1734
		if (ret)
K
Keith Busch 已提交
1735
			break;
M
Matthew Wilcox 已提交
1736
	}
1737 1738 1739

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
1740 1741
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
1742 1743 1744
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
M
Matthew Wilcox 已提交
1745 1746
}

static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}
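/*
 * Illustrative decoding for the two helpers above (not from the
 * original source): CMBSZ.SZU selects a size unit of 1 << (12 + 4 * SZU)
 * bytes, so SZU == 2 means 1 MiB units; combined with CMBSZ.SZ == 16
 * the controller would be advertising a 16 MiB controller memory buffer.
 */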

static void nvme_map_cmb(struct nvme_dev *dev)
{
	u64 size, offset;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int bar;

	if (dev->cmb_size)
		return;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!dev->cmbsz)
		return;
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);

	if (offset > bar_size)
		return;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
		return;
	}

	dev->cmb_size = size;
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb_size) {
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
		dev->cmb_size = 0;
	}
}

static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u64 dma_addr = dev->host_mem_descs_dma;
	struct nvme_command c;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode	= nvme_admin_set_features;
	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	c.features.dword11	= cpu_to_le32(bits);
	c.features.dword12	= cpu_to_le32(dev->host_mem_size >>
					      ilog2(dev->ctrl.page_size));
	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
	if (ret) {
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	}
	return ret;
}
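/*
 * Illustrative example for nvme_set_host_mem() above (not from the
 * original source): with a 128 MiB host memory buffer and a 4 KiB
 * controller page size, dword12 ends up as 128 MiB / 4 KiB = 32768
 * pages, while dword13/14 carry the DMA address of the descriptor list
 * and dword15 its entry count.
 */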

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
		size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;

		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
			       le64_to_cpu(desc->addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
	dev->host_mem_descs = NULL;
	dev->nr_host_mem_descs = 0;
}

static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	struct nvme_host_mem_buf_desc *descs;
	u32 max_entries, len;
	dma_addr_t descs_dma;
	int i = 0;
	void **bufs;
	u64 size, tmp;

	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_alloc_coherent(dev->dev, max_entries * sizeof(*descs),
				   &descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	for (size = 0; size < preferred && i < max_entries; size += len) {
		dma_addr_t dma_addr;

		len = min_t(u64, chunk_size, preferred - size);
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
		descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
		i++;
	}

	if (!size)
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--i >= 0) {
		size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;

		dma_free_attrs(dev->dev, size, bufs[i],
			       le64_to_cpu(descs[i].addr),
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	u32 chunk_size;

	/* start big and work our way down */
	for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	     chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	     chunk_size /= 2) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}
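/*
 * Illustrative numbers for the loop above (not from the original
 * source): assuming 4 KiB pages and MAX_ORDER_NR_PAGES == 1024, the
 * first attempt uses 4 MiB chunks, so a 1 GiB preferred buffer needs
 * at most 256 descriptors; each failed attempt halves the chunk size
 * until the controller's HMMINDS floor (or two pages) is reached.
 */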

static int nvme_setup_host_mem(struct nvme_dev *dev)
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
	int ret;

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
		return 0;
	}

	/*
	 * If we already have a buffer allocated check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
			return 0; /* controller must work without HMB */
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
	}

	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
		nvme_free_host_mem(dev);
	return ret;
}

/*
 * nirqs is the number of interrupts available for write and read
 * queues. The core already reserved an interrupt for the admin queue.
 */
static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
{
	struct nvme_dev *dev = affd->priv;
	unsigned int nr_read_queues;

	/*
	 * If there is no interrupt available for queues, ensure that
	 * the default queue is set to 1. The affinity set size is
	 * also set to one, but the irq core ignores it for this case.
	 *
	 * If only one interrupt is available or 'write_queues' == 0, combine
	 * write and read queues.
	 *
	 * If 'write_queues' > 0, ensure it leaves room for at least one read
	 * queue.
	 */
	if (!nrirqs) {
		nrirqs = 1;
		nr_read_queues = 0;
	} else if (nrirqs == 1 || !write_queues) {
		nr_read_queues = 0;
	} else if (write_queues >= nrirqs) {
		nr_read_queues = 1;
	} else {
		nr_read_queues = nrirqs - write_queues;
	}

	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
	affd->nr_sets = nr_read_queues ? 2 : 1;
}
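/*
 * Illustrative example for nvme_calc_irq_sets() above (not from the
 * original source): with 8 I/O interrupt vectors and write_queues == 2,
 * the branches assign 6 vectors to HCTX_TYPE_READ and the remaining 2
 * to HCTX_TYPE_DEFAULT, and two affinity sets are reported back to the
 * IRQ core.
 */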

static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	struct irq_affinity affd = {
		.pre_vectors	= 1,
		.calc_sets	= nvme_calc_irq_sets,
		.priv		= dev,
	};
	unsigned int irq_queues, this_p_queues;
	unsigned int nr_cpus = num_possible_cpus();

	/*
	 * Poll queues don't need interrupts, but we need at least one IO
	 * queue left over for non-polled IO.
	 */
	this_p_queues = poll_queues;
	if (this_p_queues >= nr_io_queues) {
		this_p_queues = nr_io_queues - 1;
		irq_queues = 1;
	} else {
		if (nr_cpus < nr_io_queues - this_p_queues)
			irq_queues = nr_cpus + 1;
		else
			irq_queues = nr_io_queues - this_p_queues + 1;
	}
	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;

	/* Initialize for the single interrupt case */
	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
	dev->io_queues[HCTX_TYPE_READ] = 0;

	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
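/*
 * Illustrative example for nvme_setup_irqs() above (not from the
 * original source): with nr_io_queues == 8 and poll_queues == 2 on a
 * 16-CPU machine, two queues are reserved for polling and irq_queues
 * becomes 8 - 2 + 1 = 7, the extra vector accounting for the admin
 * queue's pre_vectors slot.
 */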

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
		__nvme_disable_io_queues(dev, nvme_admin_delete_cq);
}

static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = &dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, nr_io_queues;
	unsigned long size;

	nr_io_queues = max_io_queues();
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	clear_bit(NVMEQ_ENABLED, &adminq->flags);

	if (dev->cmb_use_sqes) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			dev->cmb_use_sqes = false;
	}

	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues)
			return -ENOMEM;
	} while (1);
	adminq->q_db = dev->dbs;

 retry:
	/* Deregister the admin queue's interrupt */
	pci_free_irq(pdev, 0, adminq);

	/*
	 * If we enabled MSI-X early because INTx is not available, disable it
	 * again before setting up the full range we need.
	 */
	pci_free_irq_vectors(pdev);

	result = nvme_setup_irqs(dev, nr_io_queues);
	if (result <= 0)
		return -EIO;

	dev->num_vecs = result;
	result = max(result - 1, 1);
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	result = queue_request_irq(adminq);
	if (result)
		return result;
	set_bit(NVMEQ_ENABLED, &adminq->flags);

	result = nvme_create_io_queues(dev);
	if (result || dev->online_queues < 2)
		return result;

	if (dev->online_queues - 1 < dev->max_qid) {
		nr_io_queues = dev->online_queues - 1;
		nvme_disable_io_queues(dev);
		nvme_suspend_io_queues(dev);
		goto retry;
	}
	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);
	return 0;
}

static void nvme_del_queue_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	blk_mq_free_request(req);
	complete(&nvmeq->delete_done);
}

static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	if (error)
		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	nvme_del_queue_end(req, error);
}

static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = ADMIN_TIMEOUT;
	req->end_io_data = nvmeq;

	init_completion(&nvmeq->delete_done);
	blk_execute_rq_nowait(q, NULL, req, false,
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
}

static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int nr_queues = dev->online_queues - 1, sent = 0;
	unsigned long timeout;

 retry:
	timeout = ADMIN_TIMEOUT;
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
	}
	while (sent) {
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
				timeout);
		if (timeout == 0)
			return false;

		/* handle any remaining CQEs */
		if (opcode == nvme_admin_delete_cq &&
		    !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
			nvme_poll_irqdisable(nvmeq, -1);

		sent--;
		if (nr_queues)
			goto retry;
	}
	return true;
}

/*
 * return error value only when tagset allocation failed
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.nr_maps = 2; /* default + read */
		if (dev->io_queues[HCTX_TYPE_POLL])
			dev->tagset.nr_maps++;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = sizeof(struct nvme_iod);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
			return ret;
		}
		dev->ctrl.tagset = &dev->tagset;
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
	}

	nvme_dbbuf_set(dev);
	return 0;
}
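/*
 * Note (added for clarity, not in the original source): the tag set
 * depth above is dev->q_depth - 1 because one submission queue slot is
 * left unused so that a completely full queue can be distinguished
 * from an empty one.
 */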

static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;

	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
			"set queue depth=%u\n", dev->q_depth);
	}

	nvme_map_cmb(dev);

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}
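/*
 * Illustrative example for nvme_pci_enable() above (not from the
 * original source): CAP.MQES is zero based, so a controller reporting
 * MQES == 1023 yields 1024 usable queue entries, which is then clamped
 * by the io_queue_depth module parameter (default 1024); the doorbell
 * registers begin at BAR offset 4096, with the stride factor computed
 * from CAP.DSTRD as shown above.
 */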

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->bar)
		iounmap(dev->bar);
	pci_release_mem_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	pci_free_irq_vectors(pdev);

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
}

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	bool dead = true, freeze = false;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING) {
			freeze = true;
			nvme_start_freeze(&dev->ctrl);
		}
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state != pci_channel_io_normal);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead && shutdown && freeze)
		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);

	nvme_stop_queues(&dev->ctrl);

	if (!dead && dev->ctrl.queue_count > 0) {
		nvme_disable_io_queues(dev);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_suspend_io_queues(dev);
	nvme_suspend_queue(&dev->queues[0]);
	nvme_pci_disable(dev);

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
	if (shutdown) {
		nvme_start_queues(&dev->ctrl);
		if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
			blk_mq_unquiesce_queue(dev->ctrl.admin_q);
	}
	mutex_unlock(&dev->shutdown_lock);
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	put_device(dev->dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	free_opal_dev(dev->ctrl.opal_dev);
	mempool_destroy(dev->iod_mempool);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
	if (!queue_work(nvme_wq, &dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result;
	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;

	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
		result = -ENODEV;
		goto out;
	}

	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);
	nvme_sync_queues(&dev->ctrl);

	mutex_lock(&dev->shutdown_lock);
	result = nvme_pci_enable(dev);
	if (result)
		goto out_unlock;

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto out_unlock;

	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto out_unlock;

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	/*
	 * Don't limit the IOMMU merged segment size.
	 */
	dma_set_max_seg_size(dev->dev, 0xffffffff);

2509 2510 2511 2512 2513 2514 2515 2516 2517
	mutex_unlock(&dev->shutdown_lock);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		result = -EBUSY;
		goto out;
	}

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto out;

	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
	}

	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		new_state = NVME_CTRL_ADMIN_ONLY;
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		/* hit this only when tagset allocation fails */
		if (nvme_dev_add(dev))
			new_state = NVME_CTRL_ADMIN_ONLY;
		nvme_unfreeze(&dev->ctrl);
	}

	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller state %d\n", new_state);
		result = -ENODEV;
		goto out;
	}

	nvme_start_ctrl(&dev->ctrl);
	return;

 out_unlock:
	mutex_unlock(&dev->shutdown_lock);
 out:
	if (result)
		dev_warn(dev->ctrl.device,
			 "Removing after probe failure status: %d\n", result);
	nvme_remove_dead_ctrl(dev);
}

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		device_release_driver(&pdev->dev);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name			= "pcie",
	.module			= THIS_MODULE,
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.free_ctrl		= nvme_pci_free_ctrl,
	.submit_async_event	= nvme_pci_submit_async_event,
	.get_address		= nvme_pci_get_address,
};

static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_request_mem_regions(pdev, "nvme"))
		return -ENODEV;

	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
		goto release;

	return 0;
  release:
	pci_release_mem_regions(pdev);
	return -ENODEV;
}

static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and uses the deepest sleep state.
		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
		 * 950 PRO 256GB", but it seems to be restricted to two Dell
		 * laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within a few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
			return NVME_QUIRK_NO_APST;
	}

	return 0;
}

static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;

	nvme_reset_ctrl_sync(&dev->ctrl);
	flush_work(&dev->ctrl.scan_work);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;
	unsigned long quirks = id->driver_data;
	size_t alloc_size;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;

	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
					GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto put_pci;

	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto unmap;

	quirks |= check_vendor_combination_bug(pdev);

	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
						NVME_MAX_SEGS, true);
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

	nvme_get_ctrl(&dev->ctrl);
	async_schedule(nvme_async_probe, dev);

	return 0;

 release_mempool:
	mempool_destroy(dev->iod_mempool);
 release_pools:
	nvme_release_prp_pools(dev);
 unmap:
	nvme_dev_unmap(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

static void nvme_reset_prepare(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, false);
}

static void nvme_reset_done(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_reset_ctrl_sync(&dev->ctrl);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, true);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	pci_set_drvdata(pdev, NULL);

	if (!pci_device_is_present(pdev)) {
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
		nvme_dev_disable(dev, true);
		nvme_dev_remove_admin(dev);
	}

	flush_work(&dev->ctrl.reset_work);
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_release_cmb(dev);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_uninit_ctrl(&dev->ctrl);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_put_ctrl(&dev->ctrl);
}

#ifdef CONFIG_PM_SLEEP
static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
{
	return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
}

static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
{
	return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
}

static int nvme_resume(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
	struct nvme_ctrl *ctrl = &ndev->ctrl;

	if (pm_resume_via_firmware() || !ctrl->npss ||
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
		nvme_reset_ctrl(ctrl);
	return 0;
}

static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);
	struct nvme_ctrl *ctrl = &ndev->ctrl;
	int ret = -EBUSY;

	/*
	 * The platform does not remove power for a kernel managed suspend so
	 * use host managed nvme power settings for lowest idle power if
	 * possible. This should have quicker resume latency than a full device
	 * shutdown.  But if the firmware is involved after the suspend or the
	 * device does not support any non-default power states, shut down the
	 * device fully.
	 */
	if (pm_suspend_via_firmware() || !ctrl->npss) {
		nvme_dev_disable(ndev, true);
		return 0;
	}

	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

	if (ctrl->state != NVME_CTRL_LIVE &&
	    ctrl->state != NVME_CTRL_ADMIN_ONLY)
		goto unfreeze;

	ndev->last_ps = 0;
	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
	if (ret < 0)
		goto unfreeze;

	ret = nvme_set_power_state(ctrl, ctrl->npss);
	if (ret < 0)
		goto unfreeze;

	if (ret) {
		/*
		 * Clearing npss forces a controller reset on resume. The
		 * correct value will be rediscovered then.
		 */
		nvme_dev_disable(ndev, true);
		ctrl->npss = 0;
		ret = 0;
		goto unfreeze;
	}
	/*
	 * A saved state prevents pci pm from generically controlling the
	 * device's power. If we're using protocol specific settings, we don't
	 * want pci interfering.
	 */
	pci_save_state(pdev);
unfreeze:
	nvme_unfreeze(ctrl);
	return ret;
}

static int nvme_simple_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));

	nvme_dev_disable(ndev, true);
	return 0;
}

static int nvme_simple_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_reset_ctrl(&ndev->ctrl);
	return 0;
}

const struct dev_pm_ops nvme_dev_pm_ops = {
	.suspend	= nvme_suspend,
	.resume		= nvme_resume,
	.freeze		= nvme_simple_suspend,
	.thaw		= nvme_simple_resume,
	.poweroff	= nvme_simple_suspend,
	.restore	= nvme_simple_resume,
};
#endif /* CONFIG_PM_SLEEP */
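
/*
 * Note (added for clarity, not in the original source): in the
 * dev_pm_ops table above, suspend/resume take the NVMe power-state
 * path when the platform allows it, while the hibernation hooks
 * (freeze/thaw/poweroff/restore) always fall back to a full controller
 * shutdown followed by a reset.
 */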

static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
K
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

2966
	dev_info(dev->ctrl.device, "restart after slot reset\n");
K
Keith Busch 已提交
2967
	pci_restore_state(pdev);
2968
	nvme_reset_ctrl(&dev->ctrl);
K
Keith Busch 已提交
2969 2970 2971 2972 2973
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
K
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);

2979
static const struct pci_error_handlers nvme_err_handler = {
M
Matthew Wilcox 已提交
2980 2981 2982
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
2983 2984
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
M
Matthew Wilcox 已提交
2985 2986
};

2987
static const struct pci_device_id nvme_id_table[] = {
2988
	{ PCI_VDEVICE(INTEL, 0x0953),
2989
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
2990
				NVME_QUIRK_DEALLOCATE_ZEROES, },
2991 2992
	{ PCI_VDEVICE(INTEL, 0x0a53),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
2993
				NVME_QUIRK_DEALLOCATE_ZEROES, },
2994 2995
	{ PCI_VDEVICE(INTEL, 0x0a54),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
2996
				NVME_QUIRK_DEALLOCATE_ZEROES, },
2997 2998 2999
	{ PCI_VDEVICE(INTEL, 0x0a55),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
3000
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
3001 3002
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_MEDIUM_PRIO_SQ },
3003 3004
	{ PCI_VDEVICE(INTEL, 0xf1a6),	/* Intel 760p/Pro 7600p */
		.driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
3005
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
3006 3007
		.driver_data = NVME_QUIRK_IDENTIFY_CNS |
				NVME_QUIRK_DISABLE_WRITE_ZEROES, },
3008 3009
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3010 3011
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3012 3013
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3014 3015
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
3016 3017 3018 3019
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
C
Christoph Hellwig 已提交
3020 3021 3022 3023
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
W
Wei Xu 已提交
3024 3025
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
M
Matthew Wilcox 已提交
3026
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
3027
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
3028
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
M
Matthew Wilcox 已提交
3029 3030 3031 3032 3033 3034 3035 3036
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
3037
	.remove		= nvme_remove,
3038
	.shutdown	= nvme_shutdown,
3039
#ifdef CONFIG_PM_SLEEP
3040 3041 3042
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
3043
#endif
3044
	.sriov_configure = pci_sriov_configure_simple,
M
Matthew Wilcox 已提交
3045 3046 3047 3048 3049
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
3050 3051 3052
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
3053
	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
3054
	return pci_register_driver(&nvme_driver);
M
Matthew Wilcox 已提交
3055 3056 3057 3058 3059
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
3060
	flush_workqueue(nvme_wq);
M
Matthew Wilcox 已提交
3061 3062 3063 3064
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
3065
MODULE_VERSION("1.0");
M
Matthew Wilcox 已提交
3066 3067
module_init(nvme_init);
module_exit(nvme_exit);