/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>

#include "nvme.h"

#define SQ_SIZE(depth)		((depth) * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		((depth) * sizeof(struct nvme_completion))

#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127
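
/*
 * Sizing note: the two limits above keep the worst-case inline
 * scatterlist around one 4K page (127 entries times a struct scatterlist
 * of roughly 32 bytes on common 64-bit configs).  The exact per-request
 * allocation is computed in nvme_pci_iod_alloc_size().
 */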

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0444);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
	.get = param_get_int,
};

static int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should be >= 2");

static int queue_count_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops queue_count_ops = {
	.set = queue_count_set,
	.get = param_get_int,
};

static int write_queues;
module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

static int poll_queues = 0;
module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue *queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
	unsigned io_queues[HCTX_MAX_TYPES];
	unsigned int num_vecs;
	int q_depth;
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	u64 cmb_size;
	bool cmb_use_sqes;
	u32 cmbsz;
	u32 cmbloc;
	struct nvme_ctrl ctrl;

	mempool_t *iod_mempool;

	/* shadow doorbell buffer support: */
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int n = 0, ret;

	ret = kstrtoint(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_int(val, kp);
}

static int queue_count_set(const char *val, const struct kernel_param *kp)
{
	int n = 0, ret;

	ret = kstrtoint(val, 10, &n);
	if (ret)
		return ret;
	if (n > num_possible_cpus())
		n = num_possible_cpus();

	return param_set_int(val, kp);
}
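
/*
 * Shadow doorbell (dbbuf) layout: one 32-bit slot per doorbell, in the
 * same order as the BAR doorbell registers.  For queue qid the SQ tail
 * slot lives at index 2 * qid * stride and the CQ head slot right after
 * it at (2 * qid + 1) * stride.
 */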

static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t sq_lock;
	struct nvme_command *sq_cmds;
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_tail;
	u16 last_sq_tail;
	u16 cq_head;
	u16 last_cq_head;
	u16 qid;
	u8 cq_phase;
	unsigned long flags;
#define NVMEQ_ENABLED		0
#define NVMEQ_SQ_CMB		1
#define NVMEQ_DELETE_ERROR	2
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
	struct completion delete_done;
};

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_init_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_queue *nvmeq;
	bool use_sgl;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
	struct scatterlist *sg;
	struct scatterlist inline_sg[0];
};

/*
 * Check that we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
}

static unsigned int max_io_queues(void)
{
	return num_possible_cpus() + write_queues + poll_queues;
}

static unsigned int max_queue_count(void)
{
	/* IO queues + admin queue */
	return 1 + max_io_queues();
}
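
/*
 * Eight bytes per queue: a 32-bit SQ slot plus a 32-bit CQ slot, scaled
 * by the doorbell stride.  The same size is allocated twice, once for
 * the shadow doorbells and once for the event indexes.
 */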

static inline unsigned int nvme_dbbuf_size(u32 stride)
{
	return (max_queue_count() * 8 * stride);
}

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
	}
}
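
/*
 * Event-index check in the style of virtio: with 16-bit wraparound
 * arithmetic this returns true only when event_idx lies in the half-open
 * interval (old, new_idx], i.e. the controller asked to be notified for
 * a doorbell value that this update just passed.
 */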

static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory.  The controller needs to provide similar
		 * ordering to ensure the event index is updated before reading
		 * the doorbell.
		 */
		mb();

		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
}

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

C
Chaitanya Kulkarni 已提交
396 397 398 399 400
/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
static int nvme_pci_npages_sgl(unsigned int num_seg)
401
{
C
Chaitanya Kulkarni 已提交
402
	return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
C
Christoph Hellwig 已提交
403
}
404

C
Chaitanya Kulkarni 已提交
405 406
static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
		unsigned int size, unsigned int nseg, bool use_sgl)
C
Christoph Hellwig 已提交
407
{
C
Chaitanya Kulkarni 已提交
408 409 410 411 412 413 414 415
	size_t alloc_size;

	if (use_sgl)
		alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
	else
		alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);

	return alloc_size + sizeof(struct scatterlist) * nseg;
C
Christoph Hellwig 已提交
416
}
417

C
Chaitanya Kulkarni 已提交
418
static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
C
Christoph Hellwig 已提交
419
{
C
Chaitanya Kulkarni 已提交
420 421 422 423 424
	unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
				    NVME_INT_BYTES(dev), NVME_INT_PAGES,
				    use_sgl);

	return sizeof(struct nvme_iod) + alloc_size;
425 426
}

M
Matias Bjørling 已提交
427 428
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
429
{
M
Matias Bjørling 已提交
430
	struct nvme_dev *dev = data;
431
	struct nvme_queue *nvmeq = &dev->queues[0];
M
Matias Bjørling 已提交
432

433 434 435 436
	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

M
Matias Bjørling 已提交
437
	hctx->driver_data = nvmeq;
438
	nvmeq->tags = &dev->admin_tagset.tags[0];
M
Matias Bjørling 已提交
439
	return 0;
440 441
}

442 443 444 445 446 447 448
static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

M
Matias Bjørling 已提交
449 450
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
M
Matthew Wilcox 已提交
451
{
M
Matias Bjørling 已提交
452
	struct nvme_dev *dev = data;
453
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
M
Matias Bjørling 已提交
454

455 456
	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];
M
Matthew Wilcox 已提交
457

458
	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
M
Matias Bjørling 已提交
459 460
	hctx->driver_data = nvmeq;
	return 0;
M
Matthew Wilcox 已提交
461 462
}

463 464
static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
M
Matthew Wilcox 已提交
465
{
466
	struct nvme_dev *dev = set->driver_data;
C
Christoph Hellwig 已提交
467
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
468
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
469
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];
M
Matias Bjørling 已提交
470 471

	BUG_ON(!nvmeq);
C
Christoph Hellwig 已提交
472
	iod->nvmeq = nvmeq;
473 474

	nvme_req(req)->ctrl = &dev->ctrl;
M
Matias Bjørling 已提交
475 476 477
	return 0;
}

478 479 480 481 482 483 484 485 486
static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}
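
/*
 * blk-mq can expose up to three queue maps (default, read, poll).  Each
 * map is sized from dev->io_queues[]; the IRQ-driven sets are mapped via
 * the PCI IRQ affinity while the poll set has no vector and falls back
 * to the generic CPU mapping.
 */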

static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
498
			BUG_ON(i == HCTX_TYPE_DEFAULT);
499
			continue;
500 501
		}

		/*
		 * The poll queue(s) don't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
506
		map->queue_offset = qoff;
507
		if (i != HCTX_TYPE_POLL)
J
Jens Axboe 已提交
508 509 510
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
511 512 513 514 515
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
516 517
}

518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537
/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}
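
/*
 * Together with the ->commit_rqs() callback, the write_sq/bd->last hint
 * above lets submissions be batched so the SQ doorbell is written once
 * per batch, or earlier if the next command would wrap the queue.
 */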

/**
539
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
M
Matthew Wilcox 已提交
540 541
 * @nvmeq: The queue to use
 * @cmd: The command to send
542
 * @write_sq: whether to write to the SQ doorbell
M
Matthew Wilcox 已提交
543
 */
544 545
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
M
Matthew Wilcox 已提交
546
{
547
	spin_lock(&nvmeq->sq_lock);
548
	memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
549 550
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
551 552 553 554 555 556 557 558 559 560 561
	nvme_write_sq_db(nvmeq, write_sq);
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
562
	spin_unlock(&nvmeq->sq_lock);
M
Matthew Wilcox 已提交
563 564
}

C
Chaitanya Kulkarni 已提交
565
static void **nvme_pci_iod_list(struct request *req)
M
Matthew Wilcox 已提交
566
{
C
Christoph Hellwig 已提交
567
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Chaitanya Kulkarni 已提交
568
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
M
Matthew Wilcox 已提交
569 570
}

571 572 573
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
574
	int nseg = blk_rq_nr_phys_segments(req);
575 576
	unsigned int avg_seg_size;

577 578 579 580
	if (nseg == 0)
		return false;

	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
581 582 583 584 585 586 587 588 589 590

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

591
static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
592
{
C
Christoph Hellwig 已提交
593
	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
594
	int nseg = blk_rq_nr_phys_segments(rq);
595
	unsigned int size = blk_rq_payload_bytes(rq);
596

597 598
	iod->use_sgl = nvme_pci_use_sgls(dev, rq);

C
Christoph Hellwig 已提交
599
	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
600
		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
C
Christoph Hellwig 已提交
601
		if (!iod->sg)
602
			return BLK_STS_RESOURCE;
C
Christoph Hellwig 已提交
603 604
	} else {
		iod->sg = iod->inline_sg;
605 606
	}

C
Christoph Hellwig 已提交
607 608 609 610
	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;
	iod->length = size;
K
Keith Busch 已提交
611

612
	return BLK_STS_OK;
613 614
}

C
Christoph Hellwig 已提交
615
static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
M
Matthew Wilcox 已提交
616
{
C
Christoph Hellwig 已提交
617
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Chaitanya Kulkarni 已提交
618 619 620
	const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;

621 622 623
	int i;

	if (iod->npages == 0)
C
Chaitanya Kulkarni 已提交
624 625 626
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			dma_addr);

627
	for (i = 0; i < iod->npages; i++) {
C
Chaitanya Kulkarni 已提交
628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
		void *addr = nvme_pci_iod_list(req)[i];

		if (iod->use_sgl) {
			struct nvme_sgl_desc *sg_list = addr;

			next_dma_addr =
			    le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
		} else {
			__le64 *prp_list = addr;

			next_dma_addr = le64_to_cpu(prp_list[last_prp]);
		}

		dma_pool_free(dev->prp_page_pool, addr, dma_addr);
		dma_addr = next_dma_addr;
643
	}
644

C
Christoph Hellwig 已提交
645
	if (iod->sg != iod->inline_sg)
646
		mempool_free(iod->sg, dev->iod_mempool);
K
Keith Busch 已提交
647 648
}

649 650 651 652 653 654 655 656 657 658 659 660 661 662
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}
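
/*
 * PRP addressing in short: PRP1 points at the first chunk of data (it
 * may start at an offset into a page), PRP2 holds either the address of
 * the remaining data when it fits within one more page, or the address
 * of a PRP list.  List pages are chained by pointing their last entry
 * at the next list page.
 */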

static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
M
Matthew Wilcox 已提交
665
{
C
Christoph Hellwig 已提交
666
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
667
	struct dma_pool *pool;
668
	int length = blk_rq_payload_bytes(req);
669
	struct scatterlist *sg = iod->sg;
M
Matthew Wilcox 已提交
670 671
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
672
	u32 page_size = dev->ctrl.page_size;
673
	int offset = dma_addr & (page_size - 1);
674
	__le64 *prp_list;
C
Chaitanya Kulkarni 已提交
675
	void **list = nvme_pci_iod_list(req);
676
	dma_addr_t prp_dma;
677
	int nprps, i;
M
Matthew Wilcox 已提交
678

679
	length -= (page_size - offset);
680 681
	if (length <= 0) {
		iod->first_dma = 0;
C
Chaitanya Kulkarni 已提交
682
		goto done;
683
	}
M
Matthew Wilcox 已提交
684

685
	dma_len -= (page_size - offset);
M
Matthew Wilcox 已提交
686
	if (dma_len) {
687
		dma_addr += (page_size - offset);
M
Matthew Wilcox 已提交
688 689 690 691 692 693
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

694
	if (length <= page_size) {
695
		iod->first_dma = dma_addr;
C
Chaitanya Kulkarni 已提交
696
		goto done;
697 698
	}

699
	nprps = DIV_ROUND_UP(length, page_size);
700 701
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
702
		iod->npages = 0;
703 704
	} else {
		pool = dev->prp_page_pool;
705
		iod->npages = 1;
706 707
	}

708
	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
709
	if (!prp_list) {
710
		iod->first_dma = dma_addr;
711
		iod->npages = -1;
712
		return BLK_STS_RESOURCE;
713
	}
714 715
	list[0] = prp_list;
	iod->first_dma = prp_dma;
716 717
	i = 0;
	for (;;) {
718
		if (i == page_size >> 3) {
719
			__le64 *old_prp_list = prp_list;
720
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
721
			if (!prp_list)
722
				return BLK_STS_RESOURCE;
723
			list[iod->npages++] = prp_list;
724 725 726
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
727 728
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
729 730 731
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
732 733 734 735
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
736 737
		if (unlikely(dma_len < 0))
			goto bad_sgl;
738 739 740
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
M
Matthew Wilcox 已提交
741 742
	}

C
Chaitanya Kulkarni 已提交
743 744 745 746
done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);

747 748 749
	return BLK_STS_OK;

 bad_sgl:
750 751 752
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
753
	return BLK_STS_IOERR;
M
Matthew Wilcox 已提交
754 755
}

C
Chaitanya Kulkarni 已提交
756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}
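
/*
 * SGL descriptors are filled one per mapped scatterlist entry.  When a
 * descriptor page fills up, its last slot is converted into a segment
 * descriptor pointing at the next page so the controller can walk the
 * chain.
 */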

static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmd, int entries)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
785
	int i = 0;
C
Chaitanya Kulkarni 已提交
786 787 788 789

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

790
	if (entries == 1) {
C
Chaitanya Kulkarni 已提交
791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				return BLK_STS_RESOURCE;

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
831
	} while (--entries > 0);
C
Chaitanya Kulkarni 已提交
832 833 834 835

	return BLK_STS_OK;
}

836
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
837
		struct nvme_command *cmnd)
838
{
C
Christoph Hellwig 已提交
839
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
C
Christoph Hellwig 已提交
840 841 842
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
843
	blk_status_t ret = BLK_STS_IOERR;
844
	int nr_mapped;
845

846
	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
C
Christoph Hellwig 已提交
847 848 849
	iod->nents = blk_rq_map_sg(q, req, iod->sg);
	if (!iod->nents)
		goto out;
850

851
	ret = BLK_STS_RESOURCE;
852 853 854 855 856 857 858

	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
					  dma_dir);
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
					     dma_dir,  DMA_ATTR_NO_WARN);
859
	if (!nr_mapped)
C
Christoph Hellwig 已提交
860
		goto out;
861

862
	if (iod->use_sgl)
863
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
C
Chaitanya Kulkarni 已提交
864 865 866
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);

867
	if (ret != BLK_STS_OK)
C
Christoph Hellwig 已提交
868
		goto out_unmap;
869

870
	ret = BLK_STS_IOERR;
C
Christoph Hellwig 已提交
871 872 873
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;
874

875 876
		sg_init_table(&iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
C
Christoph Hellwig 已提交
877
			goto out_unmap;
878

879
		if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
C
Christoph Hellwig 已提交
880
			goto out_unmap;
M
Matthew Wilcox 已提交
881

882
		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
883 884
	}

885
	return BLK_STS_OK;
M
Matthew Wilcox 已提交
886

C
Christoph Hellwig 已提交
887 888 889 890
out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
M
Matthew Wilcox 已提交
891 892
}

C
Christoph Hellwig 已提交
893
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
M
Matthew Wilcox 已提交
894
{
C
Christoph Hellwig 已提交
895
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
896 897 898 899
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
900 901 902 903
		/* P2PDMA requests do not need to be unmapped */
		if (!is_pci_p2pdma_page(sg_page(iod->sg)))
			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);

904
		if (blk_integrity_rq(req))
905
			dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
906
	}
K
Keith Busch 已提交
907

908
	nvme_cleanup_cmd(req);
C
Christoph Hellwig 已提交
909
	nvme_free_iod(dev, req);
910
}
M
Matthew Wilcox 已提交
911

912 913 914
/*
 * NOTE: ns is NULL when called on the admin queue.
 */
915
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
M
Matias Bjørling 已提交
916
			 const struct blk_mq_queue_data *bd)
917
{
M
Matias Bjørling 已提交
918 919
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
920
	struct nvme_dev *dev = nvmeq->dev;
M
Matias Bjørling 已提交
921
	struct request *req = bd->rq;
C
Christoph Hellwig 已提交
922
	struct nvme_command cmnd;
923
	blk_status_t ret;
K
Keith Busch 已提交
924

925 926 927 928
	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
929
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
930 931
		return BLK_STS_IOERR;

932
	ret = nvme_setup_cmd(ns, req, &cmnd);
933
	if (ret)
C
Christoph Hellwig 已提交
934
		return ret;
M
Matias Bjørling 已提交
935

936
	ret = nvme_init_iod(req, dev);
937
	if (ret)
938
		goto out_free_cmd;
M
Matias Bjørling 已提交
939

940
	if (blk_rq_nr_phys_segments(req)) {
941
		ret = nvme_map_data(dev, req, &cmnd);
942 943 944
		if (ret)
			goto out_cleanup_iod;
	}
M
Matias Bjørling 已提交
945

946
	blk_mq_start_request(req);
947
	nvme_submit_cmd(nvmeq, &cmnd, bd->last);
948
	return BLK_STS_OK;
949
out_cleanup_iod:
C
Christoph Hellwig 已提交
950
	nvme_free_iod(dev, req);
951 952
out_free_cmd:
	nvme_cleanup_cmd(req);
C
Christoph Hellwig 已提交
953
	return ret;
M
Matthew Wilcox 已提交
954
}
K
Keith Busch 已提交
955

956
static void nvme_pci_complete_rq(struct request *req)
957
{
C
Christoph Hellwig 已提交
958
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
M
Matias Bjørling 已提交
959

960 961
	nvme_unmap_data(iod->nvmeq->dev, req);
	nvme_complete_rq(req);
M
Matthew Wilcox 已提交
962 963
}
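
/*
 * Completion entries carry a phase tag that flips each time the head
 * wraps around the CQ ring; an entry is only new while its phase bit
 * matches nvmeq->cq_phase.
 */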

/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
	return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
			nvmeq->cq_phase;
}

971
static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
972
{
973
	u16 head = nvmeq->cq_head;
974

975 976 977
	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
978
}
979

980
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
981
{
982
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
983
	struct request *req;
984

985 986 987 988 989
	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
M
Matthew Wilcox 已提交
990 991
	}

992 993 994 995 996 997 998
	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvmeq->qid == 0 &&
K
Keith Busch 已提交
999
			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
1000 1001
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
J
Jens Axboe 已提交
1002
		return;
1003
	}
M
Matthew Wilcox 已提交
1004

1005 1006 1007
	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
	nvme_end_request(req, cqe->status, cqe->result);
}
M
Matthew Wilcox 已提交
1008

1009
static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
M
Matthew Wilcox 已提交
1010
{
1011 1012 1013 1014 1015 1016
	while (start != end) {
		nvme_handle_cqe(nvmeq, start);
		if (++start == nvmeq->q_depth)
			start = 0;
	}
}
1017

1018 1019 1020 1021 1022
static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
	if (++nvmeq->cq_head == nvmeq->q_depth) {
		nvmeq->cq_head = 0;
		nvmeq->cq_phase = !nvmeq->cq_phase;
M
Matthew Wilcox 已提交
1023
	}
J
Jens Axboe 已提交
1024 1025
}

1026 1027
static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
				  u16 *end, unsigned int tag)
J
Jens Axboe 已提交
1028
{
1029
	int found = 0;
M
Matthew Wilcox 已提交
1030

1031
	*start = nvmeq->cq_head;
1032 1033 1034
	while (nvme_cqe_pending(nvmeq)) {
		if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
			found++;
1035
		nvme_update_cq_head(nvmeq);
1036
	}
1037
	*end = nvmeq->cq_head;
1038

1039
	if (*start != *end)
1040
		nvme_ring_cq_doorbell(nvmeq);
1041
	return found;
M
Matthew Wilcox 已提交
1042 1043 1044
}

static irqreturn_t nvme_irq(int irq, void *data)
1045 1046
{
	struct nvme_queue *nvmeq = data;
1047
	irqreturn_t ret = IRQ_NONE;
1048 1049
	u16 start, end;

1050 1051 1052 1053 1054
	/*
	 * The rmb/wmb pair ensures we see all updates from a previous run of
	 * the irq handler, even if that was on another CPU.
	 */
	rmb();
1055 1056
	if (nvmeq->cq_head != nvmeq->last_cq_head)
		ret = IRQ_HANDLED;
1057
	nvme_process_cq(nvmeq, &start, &end, -1);
1058
	nvmeq->last_cq_head = nvmeq->cq_head;
1059
	wmb();
1060

1061 1062 1063 1064 1065 1066
	if (start != end) {
		nvme_complete_cqes(nvmeq, start, end);
		return IRQ_HANDLED;
	}

	return ret;
1067 1068 1069 1070 1071
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
1072
	if (nvme_cqe_pending(nvmeq))
1073 1074
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
1075 1076
}

1077 1078 1079 1080 1081
/*
 * Poll for completions any queue, including those not dedicated to polling.
 * Can be called from any context.
 */
static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
J
Jens Axboe 已提交
1082
{
1083
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
1084
	u16 start, end;
1085
	int found;
J
Jens Axboe 已提交
1086

1087 1088 1089 1090 1091 1092 1093 1094 1095
	/*
	 * For a poll queue we need to protect against the polling thread
	 * using the CQ lock.  For normal interrupt driven threads we have
	 * to disable the interrupt to avoid racing with it.
	 */
	if (nvmeq->cq_vector == -1)
		spin_lock(&nvmeq->cq_poll_lock);
	else
		disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
1096
	found = nvme_process_cq(nvmeq, &start, &end, tag);
1097 1098 1099 1100
	if (nvmeq->cq_vector == -1)
		spin_unlock(&nvmeq->cq_poll_lock);
	else
		enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
1101

1102
	nvme_complete_cqes(nvmeq, start, end);
1103
	return found;
J
Jens Axboe 已提交
1104 1105
}

1106
static int nvme_poll(struct blk_mq_hw_ctx *hctx)
1107 1108 1109 1110 1111 1112 1113 1114
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	u16 start, end;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

1115
	spin_lock(&nvmeq->cq_poll_lock);
1116
	found = nvme_process_cq(nvmeq, &start, &end, -1);
1117
	spin_unlock(&nvmeq->cq_poll_lock);
1118 1119 1120 1121 1122

	nvme_complete_cqes(nvmeq, start, end);
	return found;
}

1123
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
M
Matthew Wilcox 已提交
1124
{
1125
	struct nvme_dev *dev = to_nvme_dev(ctrl);
1126
	struct nvme_queue *nvmeq = &dev->queues[0];
M
Matias Bjørling 已提交
1127
	struct nvme_command c;
M
Matthew Wilcox 已提交
1128

M
Matias Bjørling 已提交
1129 1130
	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
1131
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1132
	nvme_submit_cmd(nvmeq, &c, true);
1133 1134
}

M
Matthew Wilcox 已提交
1135
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
1136
{
M
Matthew Wilcox 已提交
1137 1138 1139 1140 1141 1142
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

1143
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1144 1145 1146
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
1147
		struct nvme_queue *nvmeq, s16 vector)
M
Matthew Wilcox 已提交
1148 1149
{
	struct nvme_command c;
J
Jens Axboe 已提交
1150 1151 1152 1153
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (vector != -1)
		flags |= NVME_CQ_IRQ_ENABLED;
M
Matthew Wilcox 已提交
1154

1155
	/*
M
Minwoo Im 已提交
1156
	 * Note: we (ab)use the fact that the prp fields survive if no data
1157 1158
	 * is attached to the request.
	 */
M
Matthew Wilcox 已提交
1159 1160 1161 1162 1163 1164
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
J
Jens Axboe 已提交
1165 1166 1167 1168
	if (vector != -1)
		c.create_cq.irq_vector = cpu_to_le16(vector);
	else
		c.create_cq.irq_vector = 0;
M
Matthew Wilcox 已提交
1169

1170
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1171 1172 1173 1174 1175
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
1176
	struct nvme_ctrl *ctrl = &dev->ctrl;
M
Matthew Wilcox 已提交
1177
	struct nvme_command c;
1178
	int flags = NVME_QUEUE_PHYS_CONTIG;
M
Matthew Wilcox 已提交
1179

1180 1181 1182 1183 1184 1185 1186 1187
	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

1188
	/*
M
Minwoo Im 已提交
1189
	 * Note: we (ab)use the fact that the prp fields survive if no data
1190 1191
	 * is attached to the request.
	 */
M
Matthew Wilcox 已提交
1192 1193 1194 1195 1196 1197 1198 1199
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

1200
	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
M
Matthew Wilcox 已提交
1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

1213
static void abort_endio(struct request *req, blk_status_t error)
1214
{
C
Christoph Hellwig 已提交
1215 1216
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
1217

1218 1219
	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
1220 1221
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
1222 1223
}

K
Keith Busch 已提交
1224 1225 1226 1227 1228 1229 1230 1231
static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{

	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

1232 1233 1234
	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
1235
	case NVME_CTRL_CONNECTING:
K
Keith Busch 已提交
1236
		return false;
1237 1238 1239
	default:
		break;
	}
K
Keith Busch 已提交
1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
}

1268
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
K
Keith Busch 已提交
1269
{
C
Christoph Hellwig 已提交
1270 1271
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
K
Keith Busch 已提交
1272
	struct nvme_dev *dev = nvmeq->dev;
M
Matias Bjørling 已提交
1273 1274
	struct request *abort_req;
	struct nvme_command cmd;
K
Keith Busch 已提交
1275 1276
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

W
Wen Xiong 已提交
1277 1278 1279 1280 1281 1282 1283
	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

K
Keith Busch 已提交
1284 1285 1286 1287 1288 1289
	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
1290
		nvme_reset_ctrl(&dev->ctrl);
1291
		return BLK_EH_DONE;
K
Keith Busch 已提交
1292
	}
K
Keith Busch 已提交
1293

K
Keith Busch 已提交
1294 1295 1296
	/*
	 * Did we miss an interrupt?
	 */
1297
	if (nvme_poll_irqdisable(nvmeq, req->tag)) {
K
Keith Busch 已提交
1298 1299 1300
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
1301
		return BLK_EH_DONE;
K
Keith Busch 已提交
1302 1303
	}

1304
	/*
1305 1306 1307
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
1308
	 * shutdown, so we return BLK_EH_DONE.
1309
	 */
1310 1311 1312
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
	case NVME_CTRL_RESETTING:
1313
		dev_warn_ratelimited(dev->ctrl.device,
1314 1315
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
1316
		nvme_dev_disable(dev, false);
1317
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1318
		return BLK_EH_DONE;
1319 1320
	default:
		break;
K
Keith Busch 已提交
1321 1322
	}

1323 1324 1325 1326
	/*
 	 * Shutdown the controller immediately and schedule a reset if the
 	 * command was already aborted once before and still hasn't been
 	 * returned to the driver, or if this is the admin queue.
1327
	 */
C
Christoph Hellwig 已提交
1328
	if (!nvmeq->qid || iod->aborted) {
1329
		dev_warn(dev->ctrl.device,
1330 1331
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
1332
		nvme_dev_disable(dev, false);
1333
		nvme_reset_ctrl(&dev->ctrl);
K
Keith Busch 已提交
1334

1335
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
1336
		return BLK_EH_DONE;
K
Keith Busch 已提交
1337 1338
	}

1339
	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
1340
		atomic_inc(&dev->ctrl.abort_limit);
1341
		return BLK_EH_RESET_TIMER;
1342
	}
1343
	iod->aborted = 1;
M
Matias Bjørling 已提交
1344

K
Keith Busch 已提交
1345 1346
	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
M
Matias Bjørling 已提交
1347
	cmd.abort.cid = req->tag;
K
Keith Busch 已提交
1348 1349
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

1350 1351 1352
	dev_warn(nvmeq->dev->ctrl.device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);
1353 1354

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
1355
			BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
1356 1357 1358 1359 1360 1361 1362 1363
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->timeout = ADMIN_TIMEOUT;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);
K
Keith Busch 已提交
1364

1365 1366 1367 1368 1369 1370
	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
K
Keith Busch 已提交
1371 1372
}

M
Matias Bjørling 已提交
1373 1374
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
1375 1376
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
1377 1378
	if (!nvmeq->sq_cmds)
		return;
1379

1380 1381 1382 1383 1384 1385
	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
		pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
				nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
	} else {
		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
1386
	}
1387 1388
}

1389
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
1390 1391 1392
{
	int i;

1393 1394
	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
1395
		nvme_free_queue(&dev->queues[i]);
1396
	}
1397 1398
}

K
Keith Busch 已提交
1399 1400
/**
 * nvme_suspend_queue - put queue into suspended state
1401
 * @nvmeq: queue to suspend
K
Keith Busch 已提交
1402 1403
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
M
Matthew Wilcox 已提交
1404
{
1405
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
K
Keith Busch 已提交
1406
		return 1;
1407

1408
	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
1409
	mb();
1410

1411
	nvmeq->dev->online_queues--;
1412
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
1413
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
1414 1415 1416 1417
	if (nvmeq->cq_vector == -1)
		return 0;
	pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
	nvmeq->cq_vector = -1;
K
Keith Busch 已提交
1418 1419
	return 0;
}
M
Matthew Wilcox 已提交
1420

1421
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
K
Keith Busch 已提交
1422
{
1423
	struct nvme_queue *nvmeq = &dev->queues[0];
K
Keith Busch 已提交
1424

1425 1426 1427
	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
1428
		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
1429

1430
	nvme_poll_irqdisable(nvmeq, -1);
M
Matthew Wilcox 已提交
1431 1432
}

1433 1434 1435 1436
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
1437 1438
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  dev->ctrl.page_size);
1439 1440

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
1441
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
1442
		mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
1443
		q_depth = div_u64(mem_per_q, entry_size);
1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
1460 1461 1462 1463 1464 1465
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
		nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
						nvmeq->sq_cmds);
1466 1467 1468 1469
		if (nvmeq->sq_dma_addr) {
			set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
			return 0; 
		}
1470
	}
1471

1472 1473
	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
				&nvmeq->sq_dma_addr, GFP_KERNEL);
1474 1475
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
1476 1477 1478
	return 0;
}

1479
static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
M
Matthew Wilcox 已提交
1480
{
1481
	struct nvme_queue *nvmeq = &dev->queues[qid];
M
Matthew Wilcox 已提交
1482

1483 1484
	if (dev->ctrl.queue_count > qid)
		return 0;
M
Matthew Wilcox 已提交
1485

1486
	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
J
Joe Perches 已提交
1487
					  &nvmeq->cq_dma_addr, GFP_KERNEL);
M
Matthew Wilcox 已提交
1488 1489 1490
	if (!nvmeq->cqes)
		goto free_nvmeq;

1491
	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
M
Matthew Wilcox 已提交
1492 1493
		goto free_cqdma;

1494
	nvmeq->q_dmadev = dev->dev;
M
Matthew Wilcox 已提交
1495
	nvmeq->dev = dev;
1496
	spin_lock_init(&nvmeq->sq_lock);
1497
	spin_lock_init(&nvmeq->cq_poll_lock);
M
Matthew Wilcox 已提交
1498
	nvmeq->cq_head = 0;
M
Matthew Wilcox 已提交
1499
	nvmeq->cq_phase = 1;
1500
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
M
Matthew Wilcox 已提交
1501
	nvmeq->q_depth = depth;
K
Keith Busch 已提交
1502
	nvmeq->qid = qid;
1503
	nvmeq->cq_vector = -1;
1504
	dev->ctrl.queue_count++;
1505

1506
	return 0;
M
Matthew Wilcox 已提交
1507 1508

 free_cqdma:
1509
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
M
Matthew Wilcox 已提交
1510 1511
							nvmeq->cq_dma_addr);
 free_nvmeq:
1512
	return -ENOMEM;
M
Matthew Wilcox 已提交
1513 1514
}

1515
static int queue_request_irq(struct nvme_queue *nvmeq)
1516
{
1517 1518 1519 1520 1521 1522 1523 1524 1525 1526
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
1527 1528
}

1529
static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
M
Matthew Wilcox 已提交
1530
{
1531
	struct nvme_dev *dev = nvmeq->dev;
M
Matthew Wilcox 已提交
1532

1533
	nvmeq->sq_tail = 0;
1534
	nvmeq->last_sq_tail = 0;
1535 1536
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
1537
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
1538
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
1539
	nvme_dbbuf_init(dev, nvmeq, qid);
K
Keith Busch 已提交
1540
	dev->online_queues++;
1541
	wmb(); /* ensure the first interrupt sees the initialization */
1542 1543
}

J
Jens Axboe 已提交
1544
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
1545 1546 1547
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
1548
	s16 vector;
1549

1550 1551
	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

1552 1553 1554 1555
	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
J
Jens Axboe 已提交
1556 1557 1558 1559 1560
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
		vector = -1;

1561
	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
K
Keith Busch 已提交
1562 1563
	if (result)
		return result;
M
Matthew Wilcox 已提交
1564 1565 1566

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
K
Keith Busch 已提交
1567 1568
		return result;
	else if (result)
M
Matthew Wilcox 已提交
1569 1570
		goto release_cq;

1571
	nvmeq->cq_vector = vector;
1572
	nvme_init_queue(nvmeq, qid);
J
Jens Axboe 已提交
1573 1574 1575 1576 1577 1578

	if (vector != -1) {
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}
M
Matthew Wilcox 已提交
1579

1580
	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
1581
	return result;
M
Matthew Wilcox 已提交
1582

1583 1584
release_sq:
	nvmeq->cq_vector = -1;
1585
	dev->online_queues--;
M
Matthew Wilcox 已提交
1586
	adapter_delete_sq(dev, qid);
1587
release_cq:
M
Matthew Wilcox 已提交
1588
	adapter_delete_cq(dev, qid);
1589
	return result;
M
Matthew Wilcox 已提交
1590 1591
}

1592
static const struct blk_mq_ops nvme_mq_admin_ops = {
1593
	.queue_rq	= nvme_queue_rq,
1594
	.complete	= nvme_pci_complete_rq,
M
Matias Bjørling 已提交
1595
	.init_hctx	= nvme_admin_init_hctx,
1596
	.exit_hctx      = nvme_admin_exit_hctx,
1597
	.init_request	= nvme_init_request,
M
Matias Bjørling 已提交
1598 1599 1600
	.timeout	= nvme_timeout,
};

1601
static const struct blk_mq_ops nvme_mq_ops = {
1602 1603 1604 1605 1606 1607 1608 1609
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
1610 1611
};

1612 1613
static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
1614
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
1615 1616 1617 1618 1619
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
1620
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
1621
		blk_cleanup_queue(dev->ctrl.admin_q);
1622 1623 1624 1625
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

M
Matias Bjørling 已提交
1626 1627
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
1628
	if (!dev->ctrl.admin_q) {
M
Matias Bjørling 已提交
1629 1630
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
K
Keith Busch 已提交
1631

K
Keith Busch 已提交
1632
		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
M
Matias Bjørling 已提交
1633
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
1634
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
C
Chaitanya Kulkarni 已提交
1635
		dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
1636
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
M
Matias Bjørling 已提交
1637 1638 1639 1640
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
1641
		dev->ctrl.admin_tagset = &dev->admin_tagset;
M
Matias Bjørling 已提交
1642

1643 1644
		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
M
Matias Bjørling 已提交
1645 1646 1647
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
1648
		if (!blk_get_queue(dev->ctrl.admin_q)) {
1649
			nvme_dev_remove_admin(dev);
1650
			dev->ctrl.admin_q = NULL;
1651 1652
			return -ENODEV;
		}
K
Keith Busch 已提交
1653
	} else
1654
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
M
Matias Bjørling 已提交
1655 1656 1657 1658

	return 0;
}

1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
	if (result < 0)
		return result;

	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
	if (result)
		return result;

	nvmeq = &dev->queues[0];
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
	if (result)
		return result;

	nvmeq->cq_vector = 0;
	nvme_init_queue(nvmeq, 0);
	result = queue_request_irq(nvmeq);
	if (result) {
		nvmeq->cq_vector = -1;
		return result;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;
}

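/*
 * Allocate and create all I/O queues the controller granted us.  Queues
 * beyond the interrupt-driven read/write sets are created as polled
 * queues, i.e. without a completion interrupt.
 */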
static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
	} else {
		rw_queues = max;
	}

	for (i = dev->online_queues; i <= max; i++) {
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

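/* Decode the CMB size granularity (SZU) and size (SZ) fields of CMBSZ. */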
static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}

static void nvme_map_cmb(struct nvme_dev *dev)
{
	u64 size, offset;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int bar;

	if (dev->cmb_size)
		return;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!dev->cmbsz)
		return;
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);

	if (offset > bar_size)
		return;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
		return;
	}

	dev->cmb_size = size;
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb_size) {
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
		dev->cmb_size = 0;
	}
}

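/*
 * Program the Host Memory Buffer via Set Features: the controller is
 * handed the DMA address of the descriptor list, the total size in
 * controller pages and the number of descriptors.
 */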
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u64 dma_addr = dev->host_mem_descs_dma;
	struct nvme_command c;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode	= nvme_admin_set_features;
	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	c.features.dword11	= cpu_to_le32(bits);
	c.features.dword12	= cpu_to_le32(dev->host_mem_size >>
					      ilog2(dev->ctrl.page_size));
	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
	if (ret) {
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	}
	return ret;
}

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
		size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;

		dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
				le64_to_cpu(desc->addr));
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
	dev->host_mem_descs = NULL;
	dev->nr_host_mem_descs = 0;
}

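/*
 * Carve the host memory buffer out of chunk_size pieces, up to the
 * preferred size or the controller's HMMAXD descriptor limit, and build
 * the descriptor list that will be handed to the controller.
 */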
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	struct nvme_host_mem_buf_desc *descs;
	u32 max_entries, len;
	dma_addr_t descs_dma;
	int i = 0;
	void **bufs;
	u64 size, tmp;

	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs),
			&descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	for (size = 0; size < preferred && i < max_entries; size += len) {
		dma_addr_t dma_addr;

		len = min_t(u64, chunk_size, preferred - size);
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
		descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
		i++;
	}

	if (!size)
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--i >= 0) {
		size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;

		dma_free_coherent(dev->dev, size, bufs[i],
				le64_to_cpu(descs[i].addr));
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

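/*
 * Try progressively smaller chunk sizes, halving on each failure, until
 * an allocation of at least 'min' bytes succeeds or the controller's
 * minimum chunk size (HMMINDS) is reached.
 */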
static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	u32 chunk_size;

	/* start big and work our way down */
	for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	     chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	     chunk_size /= 2) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}

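/*
 * Decide whether to reuse, grow or drop the host memory buffer based on
 * the controller's HMPRE/HMMIN hints and the max_host_mem_size_mb module
 * parameter, then tell the controller about the result.
 */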
static int nvme_setup_host_mem(struct nvme_dev *dev)
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
	int ret;

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
		return 0;
	}

	/*
	 * If we already have a buffer allocated check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
			return 0; /* controller must work without HMB */
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
	}

	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
		nvme_free_host_mem(dev);
	return ret;
}

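/*
 * Split the available interrupt-driven queues between the default
 * (write) and read queue maps according to the write_queues parameter.
 */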
static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
{
	unsigned int this_w_queues = write_queues;

	/*
	 * Setup read/write queue split
	 */
	if (irq_queues == 1) {
		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
		dev->io_queues[HCTX_TYPE_READ] = 0;
		return;
	}

	/*
	 * If 'write_queues' is set, ensure it leaves room for at least
	 * one read queue
	 */
	if (this_w_queues >= irq_queues)
		this_w_queues = irq_queues - 1;

	/*
	 * If 'write_queues' is set to zero, reads and writes will share
	 * a queue set.
	 */
	if (!this_w_queues) {
		dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues;
		dev->io_queues[HCTX_TYPE_READ] = 0;
	} else {
		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
		dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues;
	}
}

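/*
 * Reserve queues for polling first, then negotiate MSI-X/MSI vectors for
 * the remaining I/O queues plus the admin queue, shrinking the request
 * until the platform can satisfy it.
 */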
static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int irq_sets[2];
	struct irq_affinity affd = {
		.pre_vectors = 1,
		.nr_sets = ARRAY_SIZE(irq_sets),
		.sets = irq_sets,
	};
	int result = 0;
	unsigned int irq_queues, this_p_queues;

	/*
	 * Poll queues don't need interrupts, but we need at least one IO
	 * queue left over for non-polled IO.
	 */
	this_p_queues = poll_queues;
	if (this_p_queues >= nr_io_queues) {
		this_p_queues = nr_io_queues - 1;
		irq_queues = 1;
	} else {
		irq_queues = nr_io_queues - this_p_queues;
	}
	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;

	/*
	 * For irq sets, we have to ask for minvec == maxvec. This passes
	 * any reduction back to us, so we can adjust our queue counts and
	 * IRQ vector needs.
	 */
	do {
		nvme_calc_io_queues(dev, irq_queues);
		irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
		irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
		if (!irq_sets[1])
			affd.nr_sets = 1;

		/*
		 * If we got a failure and we're down to asking for just
		 * 1 + 1 queues, just ask for a single vector. We'll share
		 * that between the single IO queue and the admin queue.
		 */
		if (result >= 0 && irq_queues > 1)
			irq_queues = irq_sets[0] + irq_sets[1] + 1;

		result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
				irq_queues,
				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);

		/*
		 * Need to reduce our vec counts. If we get ENOSPC, the
		 * platform should support multiple vecs, we just need
		 * to decrease our ask. If we get EINVAL, the platform
		 * likely does not. Back down to ask for just one vector.
		 */
		if (result == -ENOSPC) {
			irq_queues--;
			if (!irq_queues)
				return result;
			continue;
		} else if (result == -EINVAL) {
			irq_queues = 1;
			continue;
		} else if (result <= 0)
			return -EIO;
		break;
	} while (1);

	return result;
}

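/*
 * Size and create the I/O queues: negotiate the queue count with the
 * controller, remap the doorbell BAR to fit, redistribute interrupt
 * vectors and finally create the queues themselves.
 */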
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = &dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, nr_io_queues;
	unsigned long size;

	nr_io_queues = max_io_queues();
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	clear_bit(NVMEQ_ENABLED, &adminq->flags);

	if (dev->cmb_use_sqes) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			dev->cmb_use_sqes = false;
	}

	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues)
			return -ENOMEM;
	} while (1);
	adminq->q_db = dev->dbs;

	/* Deregister the admin queue's interrupt */
	pci_free_irq(pdev, 0, adminq);

	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
	pci_free_irq_vectors(pdev);

	result = nvme_setup_irqs(dev, nr_io_queues);
	if (result <= 0)
		return -EIO;

	dev->num_vecs = result;
	result = max(result - 1, 1);
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */

	result = queue_request_irq(adminq);
	if (result) {
		adminq->cq_vector = -1;
		return result;
	}
	set_bit(NVMEQ_ENABLED, &adminq->flags);
	return nvme_create_io_queues(dev);
}

static void nvme_del_queue_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	blk_mq_free_request(req);
	complete(&nvmeq->delete_done);
}

static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	if (error)
		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	nvme_del_queue_end(req, error);
}

static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = ADMIN_TIMEOUT;
	req->end_io_data = nvmeq;

	init_completion(&nvmeq->delete_done);
	blk_execute_rq_nowait(q, NULL, req, false,
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
}

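/*
 * Send the given Delete SQ/CQ admin command to every online I/O queue
 * and wait for the deletions to complete, retrying as long as forward
 * progress is made within the admin timeout.
 */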
static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int nr_queues = dev->online_queues - 1, sent = 0;
	unsigned long timeout;

 retry:
	timeout = ADMIN_TIMEOUT;
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
	}
	while (sent) {
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
				timeout);
		if (timeout == 0)
			return false;

		/* handle any remaining CQEs */
		if (opcode == nvme_admin_delete_cq &&
		    !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
			nvme_poll_irqdisable(nvmeq, -1);

		sent--;
		if (nr_queues)
			goto retry;
	}
	return true;
}

/*
 * return error value only when tagset allocation failed
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.nr_maps = HCTX_MAX_TYPES;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
		if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
			dev->tagset.cmd_size = max(dev->tagset.cmd_size,
					nvme_pci_cmd_size(dev, true));
		}
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
			return ret;
		}
		dev->ctrl.tagset = &dev->tagset;

		nvme_dbbuf_set(dev);
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
	}

	return 0;
}

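/*
 * Low-level PCI bring-up: enable the device, set the DMA mask, read CAP
 * to size the queues and doorbell stride, pre-enable a single vector for
 * early setup, apply device-specific queue depth quirks and map the CMB.
 */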
static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;

	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
			"set queue depth=%u\n", dev->q_depth);
	}

	nvme_map_cmb(dev);

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->bar)
		iounmap(dev->bar);
	pci_release_mem_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	pci_free_irq_vectors(pdev);

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
}

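/*
 * Tear down the controller for reset or shutdown: freeze and stop the
 * queues, delete the I/O queues and shut the admin queue down if the
 * device is still responding, then cancel anything left outstanding.
 */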
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	int i;
	bool dead = true;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING)
			nvme_start_freeze(&dev->ctrl);
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state  != pci_channel_io_normal);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead) {
		if (shutdown)
			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
	}

	nvme_stop_queues(&dev->ctrl);

	if (!dead && dev->ctrl.queue_count > 0) {
		if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
			nvme_disable_io_queues(dev, nvme_admin_delete_cq);
		nvme_disable_admin_queue(dev, shutdown);
	}
	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
		nvme_suspend_queue(&dev->queues[i]);

	nvme_pci_disable(dev);

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
	if (shutdown)
		nvme_start_queues(&dev->ctrl);
	mutex_unlock(&dev->shutdown_lock);
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	put_device(dev->dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	free_opal_dev(dev->ctrl.opal_dev);
	mempool_destroy(dev->iod_mempool);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
{
	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);

	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
	if (!queue_work(nvme_wq, &dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

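/*
 * Controller (re)initialization path, run from the reset work: disable a
 * live controller, bring the admin queue back, re-read the identify data,
 * restore optional features (OPAL, dbbuf, HMB) and recreate the I/O
 * queues before moving the controller to its new state.
 */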
static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result = -ENODEV;
	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;

	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
		goto out;

	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		goto out;
	}

	result = nvme_pci_enable(dev);
	if (result)
		goto out;

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto out;

	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto out;

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto out;

	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
	}

	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		new_state = NVME_CTRL_ADMIN_ONLY;
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		/* hit this only when allocate tagset fails */
		if (nvme_dev_add(dev))
			new_state = NVME_CTRL_ADMIN_ONLY;
		nvme_unfreeze(&dev->ctrl);
	}

	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller state %d\n", new_state);
		goto out;
	}

	nvme_start_ctrl(&dev->ctrl);
	return;

 out:
	nvme_remove_dead_ctrl(dev, result);
}

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		device_release_driver(&pdev->dev);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name			= "pcie",
	.module			= THIS_MODULE,
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.free_ctrl		= nvme_pci_free_ctrl,
	.submit_async_event	= nvme_pci_submit_async_event,
	.get_address		= nvme_pci_get_address,
};

static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_request_mem_regions(pdev, "nvme"))
		return -ENODEV;

	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
		goto release;

	return 0;
  release:
	pci_release_mem_regions(pdev);
	return -ENODEV;
}

static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and uses the deepest sleep state.
		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
		 * 950 PRO 256GB", but it seems to be restricted to two Dell
		 * laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
			return NVME_QUIRK_NO_APST;
	}

	return 0;
}

static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;

	nvme_reset_ctrl_sync(&dev->ctrl);
	flush_work(&dev->ctrl.scan_work);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;
	unsigned long quirks = id->driver_data;
	size_t alloc_size;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;

	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
					GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto put_pci;

	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto unmap;

	quirks |= check_vendor_combination_bug(pdev);

	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
						NVME_MAX_SEGS, true);
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

	nvme_get_ctrl(&dev->ctrl);
	async_schedule(nvme_async_probe, dev);

	return 0;

 release_mempool:
	mempool_destroy(dev->iod_mempool);
 release_pools:
	nvme_release_prp_pools(dev);
 unmap:
	nvme_dev_unmap(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

static void nvme_reset_prepare(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, false);
}

static void nvme_reset_done(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_reset_ctrl_sync(&dev->ctrl);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, true);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	pci_set_drvdata(pdev, NULL);

	if (!pci_device_is_present(pdev)) {
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
		nvme_dev_disable(dev, true);
		nvme_dev_remove_admin(dev);
	}

	flush_work(&dev->ctrl.reset_work);
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_release_cmb(dev);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_uninit_ctrl(&dev->ctrl);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_put_ctrl(&dev->ctrl);
}

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_disable(ndev, true);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_reset_ctrl(&ndev->ctrl);
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	dev_info(dev->ctrl.device, "restart after slot reset\n");
	pci_restore_state(pdev);
	nvme_reset_ctrl(&dev->ctrl);
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
}

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
};

static const struct pci_device_id nvme_id_table[] = {
	{ PCI_VDEVICE(INTEL, 0x0953),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a53),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a54),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0x0a55),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_MEDIUM_PRIO_SQ },
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LightNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.sriov_configure = pci_sriov_configure_simple,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	return pci_register_driver(&nvme_driver);
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	flush_workqueue(nvme_wq);
	_nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);