/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>

#include "nvme.h"

#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))

#define SGES_PER_PAGE	(PAGE_SIZE / sizeof(struct nvme_sgl_desc))

/*
 * These can be higher, but we need to ensure that any command doesn't
 * require an sg allocation that needs more than a page of data.
 */
#define NVME_MAX_KB_SZ	4096
#define NVME_MAX_SEGS	127

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0444);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static unsigned int max_host_mem_size_mb = 128;
module_param(max_host_mem_size_mb, uint, 0444);
MODULE_PARM_DESC(max_host_mem_size_mb,
	"Maximum Host Memory Buffer (HMB) size per controller (in MiB)");

static unsigned int sgl_threshold = SZ_32K;
module_param(sgl_threshold, uint, 0644);
MODULE_PARM_DESC(sgl_threshold,
		"Use SGLs when average request segment size is larger or equal to "
		"this size. Use 0 to disable SGLs.");

static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops io_queue_depth_ops = {
	.set = io_queue_depth_set,
	.get = param_get_int,
};

static int io_queue_depth = 1024;
module_param_cb(io_queue_depth, &io_queue_depth_ops, &io_queue_depth, 0644);
MODULE_PARM_DESC(io_queue_depth, "set io queue depth, should be >= 2");

static int queue_count_set(const char *val, const struct kernel_param *kp);
static const struct kernel_param_ops queue_count_ops = {
	.set = queue_count_set,
	.get = param_get_int,
};

static int write_queues;
module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
	"Number of queues to use for writes. If not set, reads and writes "
	"will share a queue set.");

static int poll_queues = 0;
module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");

struct nvme_dev;
struct nvme_queue;

static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct nvme_queue *queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned online_queues;
	unsigned max_qid;
	unsigned io_queues[HCTX_MAX_TYPES];
	unsigned int num_vecs;
	int q_depth;
	u32 db_stride;
	void __iomem *bar;
	unsigned long bar_mapped_size;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	u64 cmb_size;
	bool cmb_use_sqes;
	u32 cmbsz;
	u32 cmbloc;
	struct nvme_ctrl ctrl;

	mempool_t *iod_mempool;

	/* shadow doorbell buffer support: */
	u32 *dbbuf_dbs;
	dma_addr_t dbbuf_dbs_dma_addr;
	u32 *dbbuf_eis;
	dma_addr_t dbbuf_eis_dma_addr;

	/* host memory buffer support: */
	u64 host_mem_size;
	u32 nr_host_mem_descs;
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
};

static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
	int n = 0, ret;

	ret = kstrtoint(val, 10, &n);
	if (ret != 0 || n < 2)
		return -EINVAL;

	return param_set_int(val, kp);
}

static int queue_count_set(const char *val, const struct kernel_param *kp)
{
	int n = 0, ret;

	ret = kstrtoint(val, 10, &n);
	if (ret)
		return ret;
	if (n > num_possible_cpus())
		n = num_possible_cpus();

	return param_set_int(val, kp);
}

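/*
 * The shadow doorbell and event-index buffers hold one 32-bit entry per
 * submission and completion doorbell, laid out in SQ/CQ pairs and scaled
 * by the controller's doorbell stride.
 */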
static inline unsigned int sq_idx(unsigned int qid, u32 stride)
{
	return qid * 2 * stride;
}

static inline unsigned int cq_idx(unsigned int qid, u32 stride)
{
	return (qid * 2 + 1) * stride;
}

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	spinlock_t sq_lock;
	struct nvme_command *sq_cmds;
	 /* only used for poll queues: */
	spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_tail;
	u16 last_sq_tail;
	u16 cq_head;
	u16 last_cq_head;
	u16 qid;
	u8 cq_phase;
	unsigned long flags;
#define NVMEQ_ENABLED		0
#define NVMEQ_SQ_CMB		1
#define NVMEQ_DELETE_ERROR	2
	u32 *dbbuf_sq_db;
	u32 *dbbuf_cq_db;
	u32 *dbbuf_sq_ei;
	u32 *dbbuf_cq_ei;
	struct completion delete_done;
};

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_init_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	struct nvme_request req;
	struct nvme_queue *nvmeq;
	bool use_sgl;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
	struct scatterlist *sg;
	struct scatterlist inline_sg[0];
};

/*
 * Check that we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != NVME_IDENTIFY_DATA_SIZE);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
	BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
}

static unsigned int max_io_queues(void)
{
	return num_possible_cpus() + write_queues + poll_queues;
}

static unsigned int max_queue_count(void)
{
	/* IO queues + admin queue */
	return 1 + max_io_queues();
}

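/* 8 bytes per possible queue: one 32-bit entry each for the SQ and CQ doorbell. */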
static inline unsigned int nvme_dbbuf_size(u32 stride)
{
	return (max_queue_count() * 8 * stride);
}

static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs)
		return 0;

	dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_dbs_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_dbs)
		return -ENOMEM;
	dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
					    &dev->dbbuf_eis_dma_addr,
					    GFP_KERNEL);
	if (!dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
		return -ENOMEM;
	}

	return 0;
}

static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
{
	unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);

	if (dev->dbbuf_dbs) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
		dev->dbbuf_dbs = NULL;
	}
	if (dev->dbbuf_eis) {
		dma_free_coherent(dev->dev, mem_size,
				  dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
		dev->dbbuf_eis = NULL;
	}
}

static void nvme_dbbuf_init(struct nvme_dev *dev,
			    struct nvme_queue *nvmeq, int qid)
{
	if (!dev->dbbuf_dbs || !qid)
		return;

	nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
	nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
}

static void nvme_dbbuf_set(struct nvme_dev *dev)
{
	struct nvme_command c;

	if (!dev->dbbuf_dbs)
		return;

	memset(&c, 0, sizeof(c));
	c.dbbuf.opcode = nvme_admin_dbbuf;
	c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
	c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);

	if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
		dev_warn(dev->ctrl.device, "unable to set dbbuf\n");
		/* Free memory and continue on */
		nvme_dbbuf_dma_free(dev);
	}
}

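/*
 * Returns true if the controller asked to be notified: the event index it
 * advertised lies between the doorbell value it last saw (old) and the new
 * value being written, using wrap-safe 16-bit arithmetic.
 */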
static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
{
	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
}

/* Update dbbuf and return true if an MMIO is required */
static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
					      volatile u32 *dbbuf_ei)
{
	if (dbbuf_db) {
		u16 old_value;

		/*
		 * Ensure that the queue is written before updating
		 * the doorbell in memory
		 */
		wmb();

		old_value = *dbbuf_db;
		*dbbuf_db = value;

		/*
		 * Ensure that the doorbell is updated before reading the event
		 * index from memory.  The controller needs to provide similar
		 * ordering to ensure the event index is updated before reading
		 * the doorbell.
		 */
		mb();

		if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
			return false;
	}

	return true;
}

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

/*
 * Calculates the number of pages needed for the SGL segments. For example a 4k
 * page can accommodate 256 SGL descriptors.
 */
static int nvme_pci_npages_sgl(unsigned int num_seg)
{
	return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
}

static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev,
		unsigned int size, unsigned int nseg, bool use_sgl)
{
	size_t alloc_size;

	if (use_sgl)
		alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
	else
		alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);

	return alloc_size + sizeof(struct scatterlist) * nseg;
}

static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl)
{
	unsigned int alloc_size = nvme_pci_iod_alloc_size(dev,
				    NVME_INT_BYTES(dev), NVME_INT_PAGES,
				    use_sgl);

	return sizeof(struct nvme_iod) + alloc_size;
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct nvme_dev *dev = set->driver_data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
	struct nvme_queue *nvmeq = &dev->queues[queue_idx];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;

	nvme_req(req)->ctrl = &dev->ctrl;
	return 0;
}

static int queue_irq_offset(struct nvme_dev *dev)
{
	/* if we have more than 1 vec, admin queue offsets us by 1 */
	if (dev->num_vecs > 1)
		return 1;

	return 0;
}

static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_dev *dev = set->driver_data;
	int i, qoff, offset;

	offset = queue_irq_offset(dev);
	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
		struct blk_mq_queue_map *map = &set->map[i];

		map->nr_queues = dev->io_queues[i];
		if (!map->nr_queues) {
			BUG_ON(i == HCTX_TYPE_DEFAULT);

			/* shared set, reuse read set parameters */
			map->nr_queues = dev->io_queues[HCTX_TYPE_DEFAULT];
			qoff = 0;
			offset = queue_irq_offset(dev);
		}

		/*
		 * The poll queue(s) doesn't have an IRQ (and hence IRQ
		 * affinity), so use the regular blk-mq cpu mapping
		 */
		map->queue_offset = qoff;
		if (i != HCTX_TYPE_POLL)
			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
		else
			blk_mq_map_queues(map);
		qoff += map->nr_queues;
		offset += map->nr_queues;
	}

	return 0;
}

/*
 * Write sq tail if we are asked to, or if the next command would wrap.
 */
static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_sq)
{
	if (!write_sq) {
		u16 next_tail = nvmeq->sq_tail + 1;

		if (next_tail == nvmeq->q_depth)
			next_tail = 0;
		if (next_tail != nvmeq->last_sq_tail)
			return;
	}

	if (nvme_dbbuf_update_and_check_event(nvmeq->sq_tail,
			nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei))
		writel(nvmeq->sq_tail, nvmeq->q_db);
	nvmeq->last_sq_tail = nvmeq->sq_tail;
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 * @write_sq: whether to write to the SQ doorbell
 */
static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
			    bool write_sq)
{
	spin_lock(&nvmeq->sq_lock);
	memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	nvme_write_sq_db(nvmeq, write_sq);
	spin_unlock(&nvmeq->sq_lock);
}

static void nvme_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	spin_lock(&nvmeq->sq_lock);
	if (nvmeq->sq_tail != nvmeq->last_sq_tail)
		nvme_write_sq_db(nvmeq, true);
	spin_unlock(&nvmeq->sq_lock);
}

static void **nvme_pci_iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	return (void **)(iod->sg + blk_rq_nr_phys_segments(req));
}

static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	int nseg = blk_rq_nr_phys_segments(req);
	unsigned int avg_seg_size;

	if (nseg == 0)
		return false;

	avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);

	if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1))))
		return false;
	if (!iod->nvmeq->qid)
		return false;
	if (!sgl_threshold || avg_seg_size < sgl_threshold)
		return false;
	return true;
}

static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
	int nseg = blk_rq_nr_phys_segments(rq);
	unsigned int size = blk_rq_payload_bytes(rq);

	iod->use_sgl = nvme_pci_use_sgls(dev, rq);

	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
		iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
		if (!iod->sg)
			return BLK_STS_RESOURCE;
	} else {
		iod->sg = iod->inline_sg;
	}

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;
	iod->length = size;

	return BLK_STS_OK;
}

static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
	dma_addr_t dma_addr = iod->first_dma, next_dma_addr;

	int i;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
			dma_addr);

	for (i = 0; i < iod->npages; i++) {
		void *addr = nvme_pci_iod_list(req)[i];

		if (iod->use_sgl) {
			struct nvme_sgl_desc *sg_list = addr;

			next_dma_addr =
			    le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr);
		} else {
			__le64 *prp_list = addr;

			next_dma_addr = le64_to_cpu(prp_list[last_prp]);
		}

		dma_pool_free(dev->prp_page_pool, addr, dma_addr);
		dma_addr = next_dma_addr;
	}

	if (iod->sg != iod->inline_sg)
		mempool_free(iod->sg, dev->iod_mempool);
}

static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
	int i;
	struct scatterlist *sg;

	for_each_sg(sgl, sg, nents, i) {
		dma_addr_t phys = sg_phys(sg);
		pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
			"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
			sg_dma_len(sg));
	}
}

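/*
 * Build the PRP entries for a request: PRP1 points into the first page
 * (possibly at an offset), and PRP2 is either the second data page or the
 * DMA address of a chained list of PRP-list pages allocated from the
 * small/page DMA pools.
 */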
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	int length = blk_rq_payload_bytes(req);
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->ctrl.page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	void **list = nvme_pci_iod_list(req);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0) {
		iod->first_dma = 0;
		goto done;
	}

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		goto done;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return BLK_STS_RESOURCE;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		if (unlikely(dma_len < 0))
			goto bad_sgl;
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

done:
	cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);

	return BLK_STS_OK;

 bad_sgl:
	WARN(DO_ONCE(nvme_print_sgl, iod->sg, iod->nents),
			"Invalid SGL for payload:%d nents:%d\n",
			blk_rq_payload_bytes(req), iod->nents);
	return BLK_STS_IOERR;
}

static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
		struct scatterlist *sg)
{
	sge->addr = cpu_to_le64(sg_dma_address(sg));
	sge->length = cpu_to_le32(sg_dma_len(sg));
	sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}

static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
		dma_addr_t dma_addr, int entries)
{
	sge->addr = cpu_to_le64(dma_addr);
	if (entries < SGES_PER_PAGE) {
		sge->length = cpu_to_le32(entries * sizeof(*sge));
		sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
	} else {
		sge->length = cpu_to_le32(PAGE_SIZE);
		sge->type = NVME_SGL_FMT_SEG_DESC << 4;
	}
}

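/*
 * Build an SGL for the request: a single inline data descriptor when there
 * is only one mapped segment, otherwise a chained list of segment pages with
 * one data descriptor per scatterlist entry.
 */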
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
		struct request *req, struct nvme_rw_command *cmd, int entries)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	struct nvme_sgl_desc *sg_list;
	struct scatterlist *sg = iod->sg;
	dma_addr_t sgl_dma;
	int i = 0;

	/* setting the transfer type as SGL */
	cmd->flags = NVME_CMD_SGL_METABUF;

	if (entries == 1) {
		nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
		return BLK_STS_OK;
	}

	if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
	if (!sg_list) {
		iod->npages = -1;
		return BLK_STS_RESOURCE;
	}

	nvme_pci_iod_list(req)[0] = sg_list;
	iod->first_dma = sgl_dma;

	nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);

	do {
		if (i == SGES_PER_PAGE) {
			struct nvme_sgl_desc *old_sg_desc = sg_list;
			struct nvme_sgl_desc *link = &old_sg_desc[i - 1];

			sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
			if (!sg_list)
				return BLK_STS_RESOURCE;

			i = 0;
			nvme_pci_iod_list(req)[iod->npages++] = sg_list;
			sg_list[i++] = *link;
			nvme_pci_sgl_set_seg(link, sgl_dma, entries);
		}

		nvme_pci_sgl_set_data(&sg_list[i++], sg);
		sg = sg_next(sg);
	} while (--entries > 0);

	return BLK_STS_OK;
}

static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
	blk_status_t ret = BLK_STS_IOERR;
	int nr_mapped;

	sg_init_table(iod->sg, blk_rq_nr_phys_segments(req));
	iod->nents = blk_rq_map_sg(q, req, iod->sg);
	if (!iod->nents)
		goto out;

	ret = BLK_STS_RESOURCE;

	if (is_pci_p2pdma_page(sg_page(iod->sg)))
		nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
					  dma_dir);
	else
		nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
					     dma_dir,  DMA_ATTR_NO_WARN);
	if (!nr_mapped)
		goto out;

	if (iod->use_sgl)
		ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw, nr_mapped);
	else
		ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);

	if (ret != BLK_STS_OK)
		goto out_unmap;

	ret = BLK_STS_IOERR;
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;

		sg_init_table(&iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
			goto out_unmap;

		if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
			goto out_unmap;

		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
	}

	return BLK_STS_OK;

out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
}

static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
		/* P2PDMA requests do not need to be unmapped */
		if (!is_pci_p2pdma_page(sg_page(iod->sg)))
			dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);

		if (blk_integrity_rq(req))
			dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
	}

	nvme_cleanup_cmd(req);
	nvme_free_iod(dev, req);
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_command cmnd;
	blk_status_t ret;

	/*
	 * We should not need to do this, but we're still using this to
	 * ensure we can drain requests on a dying queue.
	 */
	if (unlikely(!test_bit(NVMEQ_ENABLED, &nvmeq->flags)))
		return BLK_STS_IOERR;

	ret = nvme_setup_cmd(ns, req, &cmnd);
	if (ret)
		return ret;

	ret = nvme_init_iod(req, dev);
	if (ret)
		goto out_free_cmd;

	if (blk_rq_nr_phys_segments(req)) {
		ret = nvme_map_data(dev, req, &cmnd);
		if (ret)
			goto out_cleanup_iod;
	}

	blk_mq_start_request(req);
	nvme_submit_cmd(nvmeq, &cmnd, bd->last);
	return BLK_STS_OK;
out_cleanup_iod:
	nvme_free_iod(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}

static void nvme_pci_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	nvme_unmap_data(iod->nvmeq->dev, req);
	nvme_complete_rq(req);
}

/* We read the CQE phase first to check if the rest of the entry is valid */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
	return (le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
			nvmeq->cq_phase;
}

static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
{
	u16 head = nvmeq->cq_head;

	if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
					      nvmeq->dbbuf_cq_ei))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}

static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * aborts.  We don't even bother to allocate a struct request
	 * for them but rather special case them here.
	 */
	if (unlikely(nvmeq->qid == 0 &&
			cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
		nvme_complete_async_event(&nvmeq->dev->ctrl,
				cqe->status, &cqe->result);
		return;
	}

	req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
	nvme_end_request(req, cqe->status, cqe->result);
}

static void nvme_complete_cqes(struct nvme_queue *nvmeq, u16 start, u16 end)
{
	while (start != end) {
		nvme_handle_cqe(nvmeq, start);
		if (++start == nvmeq->q_depth)
			start = 0;
	}
}

static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
{
	if (++nvmeq->cq_head == nvmeq->q_depth) {
		nvmeq->cq_head = 0;
		nvmeq->cq_phase = !nvmeq->cq_phase;
	}
}

static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
				  u16 *end, unsigned int tag)
{
	int found = 0;

	*start = nvmeq->cq_head;
	while (nvme_cqe_pending(nvmeq)) {
		if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
			found++;
		nvme_update_cq_head(nvmeq);
	}
	*end = nvmeq->cq_head;

	if (*start != *end)
		nvme_ring_cq_doorbell(nvmeq);
	return found;
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	irqreturn_t ret = IRQ_NONE;
	u16 start, end;

	/*
	 * The rmb/wmb pair ensures we see all updates from a previous run of
	 * the irq handler, even if that was on another CPU.
	 */
	rmb();
	if (nvmeq->cq_head != nvmeq->last_cq_head)
		ret = IRQ_HANDLED;
	nvme_process_cq(nvmeq, &start, &end, -1);
	nvmeq->last_cq_head = nvmeq->cq_head;
	wmb();

	if (start != end) {
		nvme_complete_cqes(nvmeq, start, end);
		return IRQ_HANDLED;
	}

	return ret;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	if (nvme_cqe_pending(nvmeq))
		return IRQ_WAKE_THREAD;
	return IRQ_NONE;
}

/*
 * Poll for completions any queue, including those not dedicated to polling.
 * Can be called from any context.
 */
static int nvme_poll_irqdisable(struct nvme_queue *nvmeq, unsigned int tag)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	u16 start, end;
	int found;

	/*
	 * For a poll queue we need to protect against the polling thread
	 * using the CQ lock.  For normal interrupt driven threads we have
	 * to disable the interrupt to avoid racing with it.
	 */
	if (nvmeq->cq_vector == -1)
		spin_lock(&nvmeq->cq_poll_lock);
	else
		disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
	found = nvme_process_cq(nvmeq, &start, &end, tag);
	if (nvmeq->cq_vector == -1)
		spin_unlock(&nvmeq->cq_poll_lock);
	else
		enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));

	nvme_complete_cqes(nvmeq, start, end);
	return found;
}

static int nvme_poll(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;
	u16 start, end;
	bool found;

	if (!nvme_cqe_pending(nvmeq))
		return 0;

	spin_lock(&nvmeq->cq_poll_lock);
	found = nvme_process_cq(nvmeq, &start, &end, -1);
	spin_unlock(&nvmeq->cq_poll_lock);

	nvme_complete_cqes(nvmeq, start, end);
	return found;
}

static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	struct nvme_queue *nvmeq = &dev->queues[0];
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	nvme_submit_cmd(nvmeq, &c, true);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (vector != -1)
		flags |= NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	if (vector != -1)
		c.create_cq.irq_vector = cpu_to_le16(vector);
	else
		c.create_cq.irq_vector = 0;

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_ctrl *ctrl = &dev->ctrl;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	/*
	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
	 * set. Since URGENT priority is zeroes, it makes all queues
	 * URGENT.
	 */
	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
		flags |= NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void abort_endio(struct request *req, blk_status_t error)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;

	dev_warn(nvmeq->dev->ctrl.device,
		 "Abort status: 0x%x", nvme_req(req)->status);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);
	blk_mq_free_request(req);
}

static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
{

	/* If true, indicates loss of adapter communication, possibly by a
	 * NVMe Subsystem reset.
	 */
	bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);

	/* If there is a reset/reinit ongoing, we shouldn't reset again. */
	switch (dev->ctrl.state) {
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	default:
		break;
	}

	/* We shouldn't reset unless the controller is on fatal error state
	 * _or_ if we lost the communication with it.
	 */
	if (!(csts & NVME_CSTS_CFS) && !nssro)
		return false;

	return true;
}

static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
{
	/* Read a config register to help see what died. */
	u16 pci_status;
	int result;

	result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
				      &pci_status);
	if (result == PCIBIOS_SUCCESSFUL)
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
			 csts, pci_status);
	else
		dev_warn(dev->ctrl.device,
			 "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
			 csts, result);
}

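/*
 * Timeout handler: poll the queue in case the completion was simply missed;
 * otherwise escalate by disabling/resetting the controller or issuing an
 * Abort command for the request, depending on the controller state and
 * whether the command was already aborted once.
 */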
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_command cmd;
	u32 csts = readl(dev->bar + NVME_REG_CSTS);

	/* If PCI error recovery process is happening, we cannot reset or
	 * the recovery mechanism will surely fail.
	 */
	mb();
	if (pci_channel_offline(to_pci_dev(dev->dev)))
		return BLK_EH_RESET_TIMER;

	/*
	 * Reset immediately if the controller is failed
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);
		return BLK_EH_DONE;
	}

	/*
	 * Did we miss an interrupt?
	 */
	if (nvme_poll_irqdisable(nvmeq, req->tag)) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, completion polled\n",
			 req->tag, nvmeq->qid);
		return BLK_EH_DONE;
	}

	/*
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_DONE.
	 */
	switch (dev->ctrl.state) {
	case NVME_CTRL_CONNECTING:
	case NVME_CTRL_RESETTING:
		dev_warn_ratelimited(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	default:
		break;
	}

	/*
 	 * Shutdown the controller immediately and schedule a reset if the
 	 * command was already aborted once before and still hasn't been
 	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_disable(dev, false);
		nvme_reset_ctrl(&dev->ctrl);

		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
		return BLK_EH_DONE;
	}

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}
	iod->aborted = 1;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->dev->ctrl.device,
		"I/O %d QID %d timeout, aborting\n",
		 req->tag, nvmeq->qid);

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
			BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->timeout = ADMIN_TIMEOUT;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (!nvmeq->sq_cmds)
		return;

	if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) {
		pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
				nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth));
	} else {
		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
				nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	}
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
		dev->ctrl.queue_count--;
		nvme_free_queue(&dev->queues[i]);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq: queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
		return 1;

	/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
	mb();

	nvmeq->dev->online_queues--;
	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_quiesce_queue(nvmeq->dev->ctrl.admin_q);
	if (nvmeq->cq_vector == -1)
		return 0;
	pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
	nvmeq->cq_vector = -1;
	return 0;
}

static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
	struct nvme_queue *nvmeq = &dev->queues[0];

	if (shutdown)
		nvme_shutdown_ctrl(&dev->ctrl);
	else
		nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);

	nvme_poll_irqdisable(nvmeq, -1);
}

static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  dev->ctrl.page_size);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
		mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

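/*
 * Place the submission queue in the controller memory buffer (via P2P DMA
 * mappings) when the CMB supports SQs; otherwise fall back to coherent host
 * memory.
 */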
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
		nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
		nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
						nvmeq->sq_cmds);
		if (nvmeq->sq_dma_addr) {
			set_bit(NVMEQ_SQ_CMB, &nvmeq->flags);
			return 0; 
		}
	}

	nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
				&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		return -ENOMEM;
	return 0;
}

static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid)
		return 0;

	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
					  &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->q_dmadev = dev->dev;
	nvmeq->dev = dev;
	spin_lock_init(&nvmeq->sq_lock);
	spin_lock_init(&nvmeq->cq_poll_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	nvmeq->cq_vector = -1;
	dev->ctrl.queue_count++;

	return 0;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	return -ENOMEM;
}

static int queue_request_irq(struct nvme_queue *nvmeq)
{
	struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
	int nr = nvmeq->dev->ctrl.instance;

	if (use_threaded_interrupts) {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
				nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	} else {
		return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
				NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
	}
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	nvmeq->sq_tail = 0;
	nvmeq->last_sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	nvme_dbbuf_init(dev, nvmeq, qid);
	dev->online_queues++;
	wmb(); /* ensure the first interrupt sees the initialization */
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;
	s16 vector;

	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	/*
	 * A queue's vector matches the queue identifier unless the controller
	 * has only one vector available.
	 */
	if (!polled)
		vector = dev->num_vecs == 1 ? 0 : qid;
	else
		vector = -1;

	result = adapter_alloc_cq(dev, qid, nvmeq, vector);
	if (result)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		return result;
	else if (result)
		goto release_cq;

	nvmeq->cq_vector = vector;
	nvme_init_queue(nvmeq, qid);

	if (vector != -1) {
		result = queue_request_irq(nvmeq);
		if (result < 0)
			goto release_sq;
	}

	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
	return result;

release_sq:
	nvmeq->cq_vector = -1;
	dev->online_queues--;
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static const struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.init_hctx	= nvme_admin_init_hctx,
	.exit_hctx      = nvme_admin_exit_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
};

static const struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_pci_complete_rq,
	.commit_rqs	= nvme_commit_rqs,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.map_queues	= nvme_pci_map_queues,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		/*
		 * If the controller was reset during removal, it's possible
		 * user requests may be waiting on a stopped queue. Start the
		 * queue to flush these to completion.
		 */
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;

		dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
		dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;
		dev->ctrl.admin_tagset = &dev->admin_tagset;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unquiesce_queue(dev->ctrl.admin_q);

	return 0;
}

static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (size <= dev->bar_mapped_size)
		return 0;
	if (size > pci_resource_len(pdev, 0))
		return -ENOMEM;
	if (dev->bar)
		iounmap(dev->bar);
	dev->bar = ioremap(pci_resource_start(pdev, 0), size);
	if (!dev->bar) {
		dev->bar_mapped_size = 0;
		return -ENOMEM;
	}
	dev->bar_mapped_size = size;
	dev->dbs = dev->bar + NVME_REG_DBS;

	return 0;
}

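/*
 * Bring up the admin queue: acknowledge any pending subsystem reset status,
 * disable the controller, allocate queue 0, program AQA/ASQ/ACQ, re-enable
 * the controller and request its interrupt.
 */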
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	struct nvme_queue *nvmeq;

	result = nvme_remap_bar(dev, db_bar_size(dev, 0));
	if (result < 0)
		return result;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
				NVME_CAP_NSSRC(dev->ctrl.cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
1705

1706
	result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
1707 1708
	if (result < 0)
		return result;
M
Matthew Wilcox 已提交
1709

1710
	result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
1711 1712
	if (result)
		return result;
M
Matthew Wilcox 已提交
1713

1714
	nvmeq = &dev->queues[0];
M
Matthew Wilcox 已提交
1715 1716 1717
	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

1718 1719 1720
	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
M
Matthew Wilcox 已提交
1721

1722
	result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
1723
	if (result)
K
Keith Busch 已提交
1724
		return result;
M
Matias Bjørling 已提交
1725

K
Keith Busch 已提交
1726
	nvmeq->cq_vector = 0;
1727
	nvme_init_queue(nvmeq, 0);
1728
	result = queue_request_irq(nvmeq);
1729 1730
	if (result) {
		nvmeq->cq_vector = -1;
K
Keith Busch 已提交
1731
		return result;
1732
	}
1733

1734
	set_bit(NVMEQ_ENABLED, &nvmeq->flags);
M
Matthew Wilcox 已提交
1735 1736 1737
	return result;
}

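/*
 * Allocate and create all I/O queues up to dev->max_qid.  Queues beyond the
 * read/write set are created as polled queues, i.e. without an interrupt
 * vector.
 */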
static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max, rw_queues;
	int ret = 0;

	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
		if (nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
				dev->io_queues[HCTX_TYPE_READ];
	} else {
		rw_queues = max;
	}

	for (i = dev->online_queues; i <= max; i++) {
		bool polled = i > rw_queues;

		ret = nvme_create_queue(&dev->queues[i], i, polled);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

static ssize_t nvme_cmb_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev));

	return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz  : x%08x\n",
		       ndev->cmbloc, ndev->cmbsz);
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);

static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
	u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;

	return 1ULL << (12 + 4 * szu);
}

static u32 nvme_cmb_size(struct nvme_dev *dev)
{
	return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
}

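/*
 * Map the Controller Memory Buffer, if one is advertised, by registering the
 * corresponding BAR region as a P2P DMA resource.  The CMB may optionally be
 * used for submission queues (cmb_use_sqes) and is exposed through a "cmb"
 * sysfs attribute.
 */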
static void nvme_map_cmb(struct nvme_dev *dev)
{
	u64 size, offset;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int bar;

	if (dev->cmb_size)
		return;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!dev->cmbsz)
		return;
	dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
	offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
	bar = NVME_CMB_BIR(dev->cmbloc);
	bar_size = pci_resource_len(pdev, bar);

	if (offset > bar_size)
		return;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	if (pci_p2pdma_add_resource(pdev, bar, size, offset)) {
		dev_warn(dev->ctrl.device,
			 "failed to register the CMB\n");
		return;
	}

	dev->cmb_size = size;
	dev->cmb_use_sqes = use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS);

	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
		pci_p2pmem_publish(pdev, true);

	if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
				    &dev_attr_cmb.attr, NULL))
		dev_warn(dev->ctrl.device,
			 "failed to add sysfs attribute for CMB\n");
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb_size) {
		sysfs_remove_file_from_group(&dev->ctrl.device->kobj,
					     &dev_attr_cmb.attr, NULL);
		dev->cmb_size = 0;
	}
}

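/*
 * Host Memory Buffer handling.  nvme_set_host_mem() issues the Set Features
 * (Host Memory Buffer) command describing the previously allocated
 * descriptor list; nvme_free_host_mem() releases the descriptor list and its
 * chunk buffers.
 */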
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u64 dma_addr = dev->host_mem_descs_dma;
	struct nvme_command c;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode	= nvme_admin_set_features;
	c.features.fid		= cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
	c.features.dword11	= cpu_to_le32(bits);
	c.features.dword12	= cpu_to_le32(dev->host_mem_size >>
					      ilog2(dev->ctrl.page_size));
	c.features.dword13	= cpu_to_le32(lower_32_bits(dma_addr));
	c.features.dword14	= cpu_to_le32(upper_32_bits(dma_addr));
	c.features.dword15	= cpu_to_le32(dev->nr_host_mem_descs);

	ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
	if (ret) {
		dev_warn(dev->ctrl.device,
			 "failed to set host mem (err %d, flags %#x).\n",
			 ret, bits);
	}
	return ret;
}

static void nvme_free_host_mem(struct nvme_dev *dev)
{
	int i;

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
		size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;

		dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
				le64_to_cpu(desc->addr));
	}

	kfree(dev->host_mem_desc_bufs);
	dev->host_mem_desc_bufs = NULL;
	dma_free_coherent(dev->dev,
			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
			dev->host_mem_descs, dev->host_mem_descs_dma);
	dev->host_mem_descs = NULL;
	dev->nr_host_mem_descs = 0;
}

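/*
 * Allocate the HMB as a list of DMA chunks of up to chunk_size bytes each,
 * capped by the controller's HMMAXD limit, and record them in a descriptor
 * array that can later be handed to the controller.
 */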
static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
		u32 chunk_size)
{
	struct nvme_host_mem_buf_desc *descs;
	u32 max_entries, len;
	dma_addr_t descs_dma;
	int i = 0;
	void **bufs;
	u64 size, tmp;

	tmp = (preferred + chunk_size - 1);
	do_div(tmp, chunk_size);
	max_entries = tmp;

	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
		max_entries = dev->ctrl.hmmaxd;

	descs = dma_zalloc_coherent(dev->dev, max_entries * sizeof(*descs),
			&descs_dma, GFP_KERNEL);
	if (!descs)
		goto out;

	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
	if (!bufs)
		goto out_free_descs;

	for (size = 0; size < preferred && i < max_entries; size += len) {
		dma_addr_t dma_addr;

		len = min_t(u64, chunk_size, preferred - size);
		bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
				DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
		if (!bufs[i])
			break;

		descs[i].addr = cpu_to_le64(dma_addr);
		descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
		i++;
	}

	if (!size)
		goto out_free_bufs;

	dev->nr_host_mem_descs = i;
	dev->host_mem_size = size;
	dev->host_mem_descs = descs;
	dev->host_mem_descs_dma = descs_dma;
	dev->host_mem_desc_bufs = bufs;
	return 0;

out_free_bufs:
	while (--i >= 0) {
		size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;

		dma_free_coherent(dev->dev, size, bufs[i],
				le64_to_cpu(descs[i].addr));
	}

	kfree(bufs);
out_free_descs:
	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
			descs_dma);
out:
	dev->host_mem_descs = NULL;
	return -ENOMEM;
}

static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
{
	u32 chunk_size;

	/* start big and work our way down */
	for (chunk_size = min_t(u64, preferred, PAGE_SIZE * MAX_ORDER_NR_PAGES);
	     chunk_size >= max_t(u32, dev->ctrl.hmminds * 4096, PAGE_SIZE * 2);
	     chunk_size /= 2) {
		if (!__nvme_alloc_host_mem(dev, preferred, chunk_size)) {
			if (!min || dev->host_mem_size >= min)
				return 0;
			nvme_free_host_mem(dev);
		}
	}

	return -ENOMEM;
}

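/*
 * Decide whether to enable the Host Memory Buffer: respect the module's size
 * limit and the controller's minimum, reuse a previously allocated buffer
 * when it is still large enough, otherwise allocate a new one before
 * enabling it.  Allocation failure is not fatal; the controller must work
 * without an HMB.
 */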
static int nvme_setup_host_mem(struct nvme_dev *dev)
{
	u64 max = (u64)max_host_mem_size_mb * SZ_1M;
	u64 preferred = (u64)dev->ctrl.hmpre * 4096;
	u64 min = (u64)dev->ctrl.hmmin * 4096;
	u32 enable_bits = NVME_HOST_MEM_ENABLE;
	int ret;

	preferred = min(preferred, max);
	if (min > max) {
		dev_warn(dev->ctrl.device,
			"min host memory (%lld MiB) above limit (%d MiB).\n",
			min >> ilog2(SZ_1M), max_host_mem_size_mb);
		nvme_free_host_mem(dev);
		return 0;
	}

	/*
	 * If we already have a buffer allocated check if we can reuse it.
	 */
	if (dev->host_mem_descs) {
		if (dev->host_mem_size >= min)
			enable_bits |= NVME_HOST_MEM_RETURN;
		else
			nvme_free_host_mem(dev);
	}

	if (!dev->host_mem_descs) {
		if (nvme_alloc_host_mem(dev, min, preferred)) {
			dev_warn(dev->ctrl.device,
				"failed to allocate host memory buffer.\n");
			return 0; /* controller must work without HMB */
		}

		dev_info(dev->ctrl.device,
			"allocated %lld MiB host memory buffer.\n",
			dev->host_mem_size >> ilog2(SZ_1M));
	}

	ret = nvme_set_host_mem(dev, enable_bits);
	if (ret)
		nvme_free_host_mem(dev);
	return ret;
}

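/*
 * Split nr_io_queues between the default (write), read and poll queue sets
 * based on the write_queues and poll_queues module parameters.  For example,
 * with nr_io_queues == 8, poll_queues == 2 and write_queues == 0 this yields
 * 6 shared read/write queues, 0 dedicated read queues and 2 poll queues.
 */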
static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int nr_io_queues)
{
	unsigned int this_w_queues = write_queues;
	unsigned int this_p_queues = poll_queues;

	/*
	 * Setup read/write queue split
	 */
	if (nr_io_queues == 1) {
		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
		dev->io_queues[HCTX_TYPE_READ] = 0;
		dev->io_queues[HCTX_TYPE_POLL] = 0;
		return;
	}

	/*
	 * Configure number of poll queues, if set
	 */
	if (this_p_queues) {
		/*
		 * We need at least one queue left. With just one queue, we'll
		 * have a single shared read/write set.
		 */
		if (this_p_queues >= nr_io_queues) {
			this_w_queues = 0;
			this_p_queues = nr_io_queues - 1;
		}

		dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
		nr_io_queues -= this_p_queues;
	} else
		dev->io_queues[HCTX_TYPE_POLL] = 0;

	/*
	 * If 'write_queues' is set, ensure it leaves room for at least
	 * one read queue
	 */
	if (this_w_queues >= nr_io_queues)
		this_w_queues = nr_io_queues - 1;

	/*
	 * If 'write_queues' is set to zero, reads and writes will share
	 * a queue set.
	 */
	if (!this_w_queues) {
		dev->io_queues[HCTX_TYPE_DEFAULT] = nr_io_queues;
		dev->io_queues[HCTX_TYPE_READ] = 0;
	} else {
		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
		dev->io_queues[HCTX_TYPE_READ] = nr_io_queues - this_w_queues;
	}
}

static int nvme_setup_irqs(struct nvme_dev *dev, int nr_io_queues)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int irq_sets[2];
	struct irq_affinity affd = {
		.pre_vectors = 1,
		.nr_sets = ARRAY_SIZE(irq_sets),
		.sets = irq_sets,
	};
	int result = 0;

	/*
	 * For irq sets, we have to ask for minvec == maxvec. This passes
	 * any reduction back to us, so we can adjust our queue counts and
	 * IRQ vector needs.
	 */
	do {
		nvme_calc_io_queues(dev, nr_io_queues);
		irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
		irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
		if (!irq_sets[1])
			affd.nr_sets = 1;

		/*
		 * If we got a failure and we're down to asking for just
		 * 1 + 1 queues, just ask for a single vector. We'll share
		 * that between the single IO queue and the admin queue.
		 */
		if (!(result < 0 && nr_io_queues == 1))
			nr_io_queues = irq_sets[0] + irq_sets[1] + 1;

		result = pci_alloc_irq_vectors_affinity(pdev, nr_io_queues,
				nr_io_queues,
				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);

		/*
		 * Need to reduce our vec counts. If we get ENOSPC, the
		 * platform should support multiple vecs, we just need
		 * to decrease our ask. If we get EINVAL, the platform
		 * likely does not. Back down to ask for just one vector.
		 */
		if (result == -ENOSPC) {
			nr_io_queues--;
			if (!nr_io_queues)
				return result;
			continue;
		} else if (result == -EINVAL) {
			nr_io_queues = 1;
			continue;
		} else if (result <= 0)
			return -EIO;
		break;
	} while (1);

	return result;
}

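/*
 * Negotiate the number of I/O queues with the controller, resize the
 * doorbell BAR mapping accordingly, redistribute interrupt vectors across
 * the queue sets and finally create the I/O queues themselves.
 */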
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = &dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, nr_io_queues;
	unsigned long size;

	nr_io_queues = max_io_queues();
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	clear_bit(NVMEQ_ENABLED, &adminq->flags);

	if (dev->cmb_use_sqes) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			dev->cmb_use_sqes = false;
	}

	do {
		size = db_bar_size(dev, nr_io_queues);
		result = nvme_remap_bar(dev, size);
		if (!result)
			break;
		if (!--nr_io_queues)
			return -ENOMEM;
	} while (1);
	adminq->q_db = dev->dbs;

	/* Deregister the admin queue's interrupt */
	pci_free_irq(pdev, 0, adminq);

	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
	pci_free_irq_vectors(pdev);

	result = nvme_setup_irqs(dev, nr_io_queues);
	if (result <= 0)
		return -EIO;

	dev->num_vecs = result;
	result = max(result - 1, 1);
	dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];

	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
					dev->io_queues[HCTX_TYPE_DEFAULT],
					dev->io_queues[HCTX_TYPE_READ],
					dev->io_queues[HCTX_TYPE_POLL]);

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */

	result = queue_request_irq(adminq);
	if (result) {
		adminq->cq_vector = -1;
		return result;
	}
	set_bit(NVMEQ_ENABLED, &adminq->flags);
	return nvme_create_io_queues(dev);
}

static void nvme_del_queue_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	blk_mq_free_request(req);
	complete(&nvmeq->delete_done);
}

static void nvme_del_cq_end(struct request *req, blk_status_t error)
{
	struct nvme_queue *nvmeq = req->end_io_data;

	if (error)
		set_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);

	nvme_del_queue_end(req, error);
}

static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
{
	struct request_queue *q = nvmeq->dev->ctrl.admin_q;
	struct request *req;
	struct nvme_command cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.delete_queue.opcode = opcode;
	cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = ADMIN_TIMEOUT;
	req->end_io_data = nvmeq;

	init_completion(&nvmeq->delete_done);
	blk_execute_rq_nowait(q, NULL, req, false,
			opcode == nvme_admin_delete_cq ?
				nvme_del_cq_end : nvme_del_queue_end);
	return 0;
}

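/*
 * Tear down the I/O queues by sending asynchronous Delete SQ/CQ commands for
 * every online queue and waiting for their completions, retrying as
 * completions come back.  Returns false if a deletion timed out.
 */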
static bool nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
{
	int nr_queues = dev->online_queues - 1, sent = 0;
	unsigned long timeout;

 retry:
	timeout = ADMIN_TIMEOUT;
	while (nr_queues > 0) {
		if (nvme_delete_queue(&dev->queues[nr_queues], opcode))
			break;
		nr_queues--;
		sent++;
	}
	while (sent) {
		struct nvme_queue *nvmeq = &dev->queues[nr_queues + sent];

		timeout = wait_for_completion_io_timeout(&nvmeq->delete_done,
				timeout);
		if (timeout == 0)
			return false;

		/* handle any remaining CQEs */
		if (opcode == nvme_admin_delete_cq &&
		    !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
			nvme_poll_irqdisable(nvmeq, -1);

		sent--;
		if (nr_queues)
			goto retry;
	}
	return true;
}

/*
 * return error value only when tagset allocation failed
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.nr_maps = HCTX_MAX_TYPES;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false);
		if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) {
			dev->tagset.cmd_size = max(dev->tagset.cmd_size,
					nvme_pci_cmd_size(dev, true));
		}
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		ret = blk_mq_alloc_tag_set(&dev->tagset);
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
			return ret;
		}
		dev->ctrl.tagset = &dev->tagset;

		nvme_dbbuf_set(dev);
	} else {
		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);

		/* Free previously allocated queues that are no longer usable */
		nvme_free_queues(dev, dev->online_queues);
	}

	return 0;
}

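/*
 * Enable the PCI device: set up the DMA masks, pre-allocate a single
 * interrupt vector for the admin queue, read CAP to size the queue depth and
 * doorbell stride, apply controller-specific quirks and map the CMB if
 * present.
 */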
static int nvme_pci_enable(struct nvme_dev *dev)
{
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	pci_set_master(pdev);

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		return result;

	dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
				io_queue_depth);
	dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->ctrl.device, "detected Apple NVMe controller, "
			"set queue depth=%u to work around controller resets\n",
			dev->q_depth);
	} else if (pdev->vendor == PCI_VENDOR_ID_SAMSUNG &&
		   (pdev->device == 0xa821 || pdev->device == 0xa822) &&
		   NVME_CAP_MQES(dev->ctrl.cap) == 0) {
		dev->q_depth = 64;
		dev_err(dev->ctrl.device, "detected PM1725 NVMe controller, "
			"set queue depth=%u\n", dev->q_depth);
	}

	nvme_map_cmb(dev);

	pci_enable_pcie_error_reporting(pdev);
	pci_save_state(pdev);
	return 0;

 disable:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->bar)
		iounmap(dev->bar);
	pci_release_mem_regions(to_pci_dev(dev->dev));
}

static void nvme_pci_disable(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	pci_free_irq_vectors(pdev);

	if (pci_is_enabled(pdev)) {
		pci_disable_pcie_error_reporting(pdev);
		pci_disable_device(pdev);
	}
}

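/*
 * Shut the controller down (or prepare it for reset): freeze and quiesce the
 * queues, delete the I/O queues and disable the admin queue if the device is
 * still responding, then cancel any outstanding requests.
 */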
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	int i;
	bool dead = true;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	mutex_lock(&dev->shutdown_lock);
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);

		if (dev->ctrl.state == NVME_CTRL_LIVE ||
		    dev->ctrl.state == NVME_CTRL_RESETTING)
			nvme_start_freeze(&dev->ctrl);
		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
			pdev->error_state  != pci_channel_io_normal);
	}

	/*
	 * Give the controller a chance to complete all entered requests if
	 * doing a safe shutdown.
	 */
	if (!dead) {
		if (shutdown)
			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
	}

	nvme_stop_queues(&dev->ctrl);

	if (!dead && dev->ctrl.queue_count > 0) {
		if (nvme_disable_io_queues(dev, nvme_admin_delete_sq))
			nvme_disable_io_queues(dev, nvme_admin_delete_cq);
		nvme_disable_admin_queue(dev, shutdown);
	}
	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
		nvme_suspend_queue(&dev->queues[i]);

	nvme_pci_disable(dev);

	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);

	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
	 * deadlocking blk-mq hot-cpu notifier.
	 */
	if (shutdown)
		nvme_start_queues(&dev->ctrl);
	mutex_unlock(&dev->shutdown_lock);
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	put_device(dev->dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	free_opal_dev(dev->ctrl.opal_dev);
	mempool_destroy(dev->iod_mempool);
	kfree(dev);
}

static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
{
	dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);

	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
	if (!queue_work(nvme_wq, &dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

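/*
 * Controller (re)initialization, run from the reset work: disable a live
 * controller, re-enable PCI resources, bring up the admin queue and tag set,
 * re-read the identify data, restore OPAL/HMB/dbbuf state and set up the
 * I/O queues.  If no I/O queues could be created the controller is kept in
 * the ADMIN_ONLY state.
 */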
static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev =
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result = -ENODEV;
	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;

	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
		goto out;

	/*
	 * If we're called to reset a live controller first shut it down before
	 * moving on.
	 */
	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
		nvme_dev_disable(dev, false);

	/*
	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
	 * initializing procedure here.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller CONNECTING\n");
		goto out;
	}

	result = nvme_pci_enable(dev);
	if (result)
		goto out;

	result = nvme_pci_configure_admin_queue(dev);
	if (result)
		goto out;

	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto out;

	/*
	 * Limit the max command size to prevent iod->sg allocations going
	 * over a single page.
	 */
	dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;
	dev->ctrl.max_segments = NVME_MAX_SEGS;

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto out;

	if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
		if (!dev->ctrl.opal_dev)
			dev->ctrl.opal_dev =
				init_opal_dev(&dev->ctrl, &nvme_sec_submit);
		else if (was_suspend)
			opal_unlock_from_suspend(dev->ctrl.opal_dev);
	} else {
		free_opal_dev(dev->ctrl.opal_dev);
		dev->ctrl.opal_dev = NULL;
	}

	if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
		result = nvme_dbbuf_dma_alloc(dev);
		if (result)
			dev_warn(dev->dev,
				 "unable to allocate dma for dbbuf\n");
	}

	if (dev->ctrl.hmpre) {
		result = nvme_setup_host_mem(dev);
		if (result < 0)
			goto out;
	}

	result = nvme_setup_io_queues(dev);
	if (result)
		goto out;

	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		new_state = NVME_CTRL_ADMIN_ONLY;
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		/* hit this only when allocate tagset fails */
		if (nvme_dev_add(dev))
			new_state = NVME_CTRL_ADMIN_ONLY;
		nvme_unfreeze(&dev->ctrl);
	}

	/*
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller state %d\n", new_state);
		goto out;
	}

	nvme_start_ctrl(&dev->ctrl);
	return;

 out:
	nvme_remove_dead_ctrl(dev, result);
}

static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		device_release_driver(&pdev->dev);
	nvme_put_ctrl(&dev->ctrl);
}

static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);

	return snprintf(buf, size, "%s", dev_name(&pdev->dev));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.name			= "pcie",
	.module			= THIS_MODULE,
	.flags			= NVME_F_METADATA_SUPPORTED |
				  NVME_F_PCI_P2PDMA,
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.free_ctrl		= nvme_pci_free_ctrl,
	.submit_async_event	= nvme_pci_submit_async_event,
	.get_address		= nvme_pci_get_address,
};
2679 2680 2681 2682
static int nvme_dev_map(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

2683
	if (pci_request_mem_regions(pdev, "nvme"))
2684 2685
		return -ENODEV;

2686
	if (nvme_remap_bar(dev, NVME_REG_DBS + 4096))
2687 2688
		goto release;

M
Max Gurtovoy 已提交
2689
	return 0;
2690
  release:
M
Max Gurtovoy 已提交
2691 2692
	pci_release_mem_regions(pdev);
	return -ENODEV;
2693 2694
}

2695
static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709
{
	if (pdev->vendor == 0x144d && pdev->device == 0xa802) {
		/*
		 * Several Samsung devices seem to drop off the PCIe bus
		 * randomly when APST is on and uses the deepest sleep state.
		 * This has been observed on a Samsung "SM951 NVMe SAMSUNG
		 * 256GB", a "PM951 NVMe SAMSUNG 512GB", and a "Samsung SSD
		 * 950 PRO 256GB", but it seems to be restricted to two Dell
		 * laptops.
		 */
		if (dmi_match(DMI_SYS_VENDOR, "Dell Inc.") &&
		    (dmi_match(DMI_PRODUCT_NAME, "XPS 15 9550") ||
		     dmi_match(DMI_PRODUCT_NAME, "Precision 5510")))
			return NVME_QUIRK_NO_DEEPEST_PS;
2710 2711 2712
	} else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
		/*
		 * Samsung SSD 960 EVO drops off the PCIe bus after system
2713 2714 2715
		 * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
		 * within few minutes after bootup on a Coffee Lake board -
		 * ASUS PRIME Z370-A
2716 2717
		 */
		if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
2718 2719
		    (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
		     dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
2720
			return NVME_QUIRK_NO_APST;
2721 2722 2723 2724 2725
	}

	return 0;
}

static void nvme_async_probe(void *data, async_cookie_t cookie)
{
	struct nvme_dev *dev = data;

	nvme_reset_ctrl_sync(&dev->ctrl);
	flush_work(&dev->ctrl.scan_work);
	nvme_put_ctrl(&dev->ctrl);
}

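/*
 * PCI probe: allocate the nvme_dev and its queue array, map BAR 0, create
 * the PRP pools and the iod mempool, register the nvme_ctrl and then kick
 * off the first controller reset asynchronously.
 */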
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;
	unsigned long quirks = id->driver_data;
	size_t alloc_size;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;

	dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
					GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	result = nvme_dev_map(dev);
	if (result)
		goto put_pci;

	INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto unmap;

	quirks |= check_vendor_combination_bug(pdev);

	/*
	 * Double check that our mempool alloc size will cover the biggest
	 * command we support.
	 */
	alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
						NVME_MAX_SEGS, true);
	WARN_ON_ONCE(alloc_size > PAGE_SIZE);

	dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
						mempool_kfree,
						(void *) alloc_size,
						GFP_KERNEL, node);
	if (!dev->iod_mempool) {
		result = -ENOMEM;
		goto release_pools;
	}

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			quirks);
	if (result)
		goto release_mempool;

	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));

	nvme_get_ctrl(&dev->ctrl);
	async_schedule(nvme_async_probe, dev);

	return 0;

 release_mempool:
	mempool_destroy(dev->iod_mempool);
 release_pools:
	nvme_release_prp_pools(dev);
 unmap:
	nvme_dev_unmap(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev);
	return result;
}

static void nvme_reset_prepare(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, false);
}

static void nvme_reset_done(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_reset_ctrl_sync(&dev->ctrl);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, true);
}

/*
 * The driver's remove may be called on a device in a partially initialized
 * state. This function must not have any dependencies on the device state in
 * order to proceed.
 */
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	pci_set_drvdata(pdev, NULL);

	if (!pci_device_is_present(pdev)) {
		nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
		nvme_dev_disable(dev, true);
		nvme_dev_remove_admin(dev);
	}

	flush_work(&dev->ctrl.reset_work);
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
	nvme_release_cmb(dev);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_uninit_ctrl(&dev->ctrl);
	nvme_release_prp_pools(dev);
	nvme_dev_unmap(dev);
	nvme_put_ctrl(&dev->ctrl);
}

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_disable(ndev, true);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_reset_ctrl(&ndev->ctrl);
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
						pci_channel_state_t state)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	/*
	 * A frozen channel requires a reset. When detected, this method will
	 * shutdown the controller to quiesce. The controller will be restarted
	 * after the slot reset through driver's slot_reset callback.
	 */
	switch (state) {
	case pci_channel_io_normal:
		return PCI_ERS_RESULT_CAN_RECOVER;
	case pci_channel_io_frozen:
		dev_warn(dev->ctrl.device,
			"frozen state error detected, reset controller\n");
		nvme_dev_disable(dev, false);
		return PCI_ERS_RESULT_NEED_RESET;
	case pci_channel_io_perm_failure:
		dev_warn(dev->ctrl.device,
			"failure state error detected, request disconnect\n");
		return PCI_ERS_RESULT_DISCONNECT;
	}
	return PCI_ERS_RESULT_NEED_RESET;
}

static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	dev_info(dev->ctrl.device, "restart after slot reset\n");
	pci_restore_state(pdev);
	nvme_reset_ctrl(&dev->ctrl);
	return PCI_ERS_RESULT_RECOVERED;
}

static void nvme_error_resume(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	flush_work(&dev->ctrl.reset_work);
}

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_prepare	= nvme_reset_prepare,
	.reset_done	= nvme_reset_done,
};
2938
static const struct pci_device_id nvme_id_table[] = {
2939
	{ PCI_VDEVICE(INTEL, 0x0953),
2940
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
2941
				NVME_QUIRK_DEALLOCATE_ZEROES, },
2942 2943
	{ PCI_VDEVICE(INTEL, 0x0a53),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
2944
				NVME_QUIRK_DEALLOCATE_ZEROES, },
2945 2946
	{ PCI_VDEVICE(INTEL, 0x0a54),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
2947
				NVME_QUIRK_DEALLOCATE_ZEROES, },
2948 2949 2950
	{ PCI_VDEVICE(INTEL, 0x0a55),
		.driver_data = NVME_QUIRK_STRIPE_SIZE |
				NVME_QUIRK_DEALLOCATE_ZEROES, },
2951
	{ PCI_VDEVICE(INTEL, 0xf1a5),	/* Intel 600P/P3100 */
2952 2953
		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
				NVME_QUIRK_MEDIUM_PRIO_SQ },
2954 2955
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
2956 2957
	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
2958 2959
	{ PCI_DEVICE(0x1c58, 0x0003),	/* HGST adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
2960 2961
	{ PCI_DEVICE(0x1c58, 0x0023),	/* WDC SN200 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
2962 2963
	{ PCI_DEVICE(0x1c5f, 0x0540),	/* Memblaze Pblaze4 adapter */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
2964 2965 2966 2967
	{ PCI_DEVICE(0x144d, 0xa821),   /* Samsung PM1725 */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
	{ PCI_DEVICE(0x144d, 0xa822),   /* Samsung PM1725a */
		.driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, },
C
Christoph Hellwig 已提交
2968 2969 2970 2971
	{ PCI_DEVICE(0x1d1d, 0x1f1f),	/* LighNVM qemu device */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
	{ PCI_DEVICE(0x1d1d, 0x2807),	/* CNEX WL */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
W
Wei Xu 已提交
2972 2973
	{ PCI_DEVICE(0x1d1d, 0x2601),	/* CNEX Granby */
		.driver_data = NVME_QUIRK_LIGHTNVM, },
M
Matthew Wilcox 已提交
2974
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
2975
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
2976
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
M
Matthew Wilcox 已提交
2977 2978 2979 2980 2981 2982 2983 2984
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.sriov_configure = pci_sriov_configure_simple,
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	return pci_register_driver(&nvme_driver);
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	flush_workqueue(nvme_wq);
	_nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);