/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/unaligned.h>

#include "nvme.h"

#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
		
/*
 * We handle AEN commands ourselves and don't even let the
 * block layer know about them.
 */
#define NVME_NR_AEN_COMMANDS	1
#define NVME_AQ_BLKMQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)

unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

struct nvme_dev;
struct nvme_queue;

static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
static void nvme_dev_shutdown(struct nvme_dev *dev);

struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	int status;
	void *ctx;
};

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned queue_count;
	unsigned online_queues;
	unsigned max_qid;
	int q_depth;
	u32 db_stride;
	struct msix_entry *entry;
	void __iomem *bar;
	struct work_struct reset_work;
	struct work_struct scan_work;
	struct work_struct remove_work;
	struct mutex shutdown_lock;
	bool subsystem;
	void __iomem *cmb;
	dma_addr_t cmb_dma_addr;
	u64 cmb_size;
	u32 cmbsz;
	unsigned long flags;
#define NVME_CTRL_RESETTING    0

	struct nvme_ctrl ctrl;
};

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	struct nvme_command __iomem *sq_cmds_io;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	struct async_cmd_info cmdinfo;
};

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_init_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	struct nvme_queue *nvmeq;
	int aborted;
	int npages;		/* In the PRP list. 0 means small pool in use */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg; /* metadata requires single contiguous buffer */
	struct scatterlist *sg;
	struct scatterlist inline_sg[0];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev,
		unsigned int size, unsigned int nseg)
{
	return sizeof(__le64 *) * nvme_npages(size, dev) +
			sizeof(struct scatterlist) * nseg;
}

static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
	return sizeof(struct nvme_iod) +
		nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
}

static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_admin_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[0];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	BUG_ON(!nvmeq);
	iod->nvmeq = nvmeq;
	return 0;
}

static void nvme_complete_async_event(struct nvme_dev *dev,
		struct nvme_completion *cqe)
{
	u16 status = le16_to_cpu(cqe->status) >> 1;
	u32 result = le32_to_cpu(cqe->result);

	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
		++dev->ctrl.event_limit;
	if (status != NVME_SC_SUCCESS)
		return;

	switch (result & 0xff07) {
	case NVME_AER_NOTICE_NS_CHANGED:
		dev_info(dev->dev, "rescanning\n");
		queue_work(nvme_workq, &dev->scan_work);
	default:
		dev_warn(dev->dev, "async event result %08x\n", result);
	}
}

/**
 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
						struct nvme_command *cmd)
{
	u16 tail = nvmeq->sq_tail;

	if (nvmeq->sq_cmds_io)
		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
	else
		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
}

static __le64 **iod_list(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	return (__le64 **)(iod->sg + req->nr_phys_segments);
}

static int nvme_init_iod(struct request *rq, struct nvme_dev *dev)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(rq);
	int nseg = rq->nr_phys_segments;
	unsigned size;

	if (rq->cmd_flags & REQ_DISCARD)
		size = sizeof(struct nvme_dsm_range);
	else
		size = blk_rq_bytes(rq);

	if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) {
		iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC);
		if (!iod->sg)
			return BLK_MQ_RQ_QUEUE_BUSY;
	} else {
		iod->sg = iod->inline_sg;
	}

	iod->aborted = 0;
	iod->npages = -1;
	iod->nents = 0;
	iod->length = size;
	return 0;
}

static void nvme_free_iod(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	const int last_prp = dev->ctrl.page_size / 8 - 1;
	int i;
	__le64 **list = iod_list(req);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}

	if (iod->sg != iod->inline_sg)
		kfree(iod->sg);
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == v)
		pi->ref_tag = cpu_to_be32(p);
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == p)
		pi->ref_tag = cpu_to_be32(v);
}

/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 *
 * The virtual start sector is the one that was originally submitted by the
 * block layer.	Due to partitioning, MD/DM cloning, etc. the actual physical
 * start sector may be different. Remap protection information to match the
 * physical LBA on writes, and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	struct nvme_ns *ns = req->rq_disk->private_data;
	struct bio_integrity_payload *bip;
	struct t10_pi_tuple *pi;
	void *p, *pmap;
	u32 i, nlb, ts, phys, virt;

	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
		return;

	bip = bio_integrity(req->bio);
	if (!bip)
		return;

	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

	p = pmap;
	virt = bip_get_seed(bip);
	phys = nvme_block_nr(ns, blk_rq_pos(req));
	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
	ts = ns->disk->queue->integrity.tuple_size;

	for (i = 0; i < nlb; i++, virt++, phys++) {
		pi = (struct t10_pi_tuple *)p;
		dif_swap(phys, virt, pi);
		p += ts;
	}
	kunmap_atomic(pmap);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
#endif

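/*
 * Build the PRP (Physical Region Page) list for a request.  The first PRP
 * entry lives in the command itself; anything longer is chained through
 * pages allocated from the per-device PRP dma_pools and freed again in
 * nvme_free_iod().  Returns false if a PRP page allocation fails.
 */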
static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req,
		int total_len)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->ctrl.page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	__le64 **list = iod_list(req);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0)
		return true;

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		return true;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return false;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return false;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return true;
}

static int nvme_map_data(struct nvme_dev *dev, struct request *req,
		struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
	int ret = BLK_MQ_RQ_QUEUE_ERROR;

	sg_init_table(iod->sg, req->nr_phys_segments);
	iod->nents = blk_rq_map_sg(q, req, iod->sg);
	if (!iod->nents)
		goto out;

	ret = BLK_MQ_RQ_QUEUE_BUSY;
	if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
		goto out;

	if (!nvme_setup_prps(dev, req, blk_rq_bytes(req)))
		goto out_unmap;

	ret = BLK_MQ_RQ_QUEUE_ERROR;
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;

		sg_init_table(&iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1)
			goto out_unmap;

		if (rq_data_dir(req))
			nvme_dif_remap(req, nvme_dif_prep);

		if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir))
			goto out_unmap;
	}

	cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
	if (blk_integrity_rq(req))
		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg));
	return BLK_MQ_RQ_QUEUE_OK;

out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
}

static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
		if (blk_integrity_rq(req)) {
			if (!rq_data_dir(req))
				nvme_dif_remap(req, nvme_dif_complete);
			dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir);
		}
	}

	nvme_free_iod(dev, req);
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct request *req, struct nvme_command *cmnd)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dsm_range *range;

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
						&iod->first_dma);
	if (!range)
		return BLK_MQ_RQ_QUEUE_BUSY;
	iod_list(req)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
	return BLK_MQ_RQ_QUEUE_OK;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_command cmnd;
	int ret = BLK_MQ_RQ_QUEUE_OK;

	/*
	 * If formatted with metadata, require the block layer to provide a buffer
	 * unless this namespace is formatted such that the metadata can be
	 * stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
					req->cmd_type != REQ_TYPE_DRV_PRIV) {
			blk_mq_end_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	ret = nvme_init_iod(req, dev);
	if (ret)
		return ret;

	if (req->cmd_flags & REQ_DISCARD) {
		ret = nvme_setup_discard(nvmeq, ns, req, &cmnd);
	} else {
		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
			memcpy(&cmnd, req->cmd, sizeof(cmnd));
		else if (req->cmd_flags & REQ_FLUSH)
			nvme_setup_flush(ns, &cmnd);
		else
			nvme_setup_rw(ns, req, &cmnd);

		if (req->nr_phys_segments)
			ret = nvme_map_data(dev, req, &cmnd);
	}

	if (ret)
		goto out;

	cmnd.common.command_id = req->tag;
	blk_mq_start_request(req);

	spin_lock_irq(&nvmeq->q_lock);
	__nvme_submit_cmd(nvmeq, &cmnd);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;
out:
	nvme_free_iod(dev, req);
	return ret;
}

static void nvme_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_dev *dev = iod->nvmeq->dev;
	int error = 0;

	nvme_unmap_data(dev, req);

	if (unlikely(req->errors)) {
		if (nvme_req_needs_retry(req, req->errors)) {
			nvme_requeue_req(req);
			return;
		}

		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
			error = req->errors;
		else
			error = nvme_error_status(req->errors);
	}

	if (unlikely(iod->aborted)) {
		dev_warn(dev->dev,
			"completing aborted command with status: %04x\n",
			req->errors);
	}

	blk_mq_end_request(req, error);
}

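/*
 * Walk the completion queue starting at cq_head.  An entry is valid when its
 * phase bit matches the queue's current cq_phase; the phase flips each time
 * the head wraps around the end of the queue.  If @tag is non-NULL it is set
 * to -1 when a matching command id is seen (used by nvme_poll).
 */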
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		struct nvme_completion cqe = nvmeq->cqes[head];
		u16 status = le16_to_cpu(cqe.status);
		struct request *req;

		if ((status & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		if (tag && *tag == cqe.command_id)
			*tag = -1;

		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
			dev_warn(nvmeq->q_dmadev,
				"invalid id %d completed on queue %d\n",
				cqe.command_id, le16_to_cpu(cqe.sq_id));
			continue;
		}

		/*
		 * AEN requests are special as they don't time out and can
		 * survive any kind of queue freeze and often don't respond to
		 * aborts.  We don't even bother to allocate a struct request
		 * for them but rather special case them here.
		 */
		if (unlikely(nvmeq->qid == 0 &&
				cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
			nvme_complete_async_event(nvmeq->dev, &cqe);
			continue;
		}

		req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
			u32 result = le32_to_cpu(cqe.result);
			req->special = (void *)(uintptr_t)result;
		}
		blk_mq_complete_request(req, status >> 1);

	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return;

	if (likely(nvmeq->cq_vector >= 0))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
}

static void nvme_process_cq(struct nvme_queue *nvmeq)
{
	__nvme_process_cq(nvmeq, NULL);
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

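/* blk-mq poll callback: reap completions directly instead of waiting for an interrupt. */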
static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
	    nvmeq->cq_phase) {
		spin_lock_irq(&nvmeq->q_lock);
		__nvme_process_cq(nvmeq, &tag);
		spin_unlock_irq(&nvmeq->q_lock);

		if (tag == -1)
			return 1;
	}

	return 0;
}

static void nvme_submit_async_event(struct nvme_dev *dev)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit;

	__nvme_submit_cmd(dev->queues[0], &c);
}

static void async_cmd_info_endio(struct request *req, int error)
{
	struct async_cmd_info *cmdinfo = req->end_io_data;

	cmdinfo->status = req->errors;
	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
	blk_mq_free_request(req);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

static void abort_endio(struct request *req, int error)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	u32 result = (u32)(uintptr_t)req->special;
	u16 status = req->errors;

	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
	atomic_inc(&nvmeq->dev->ctrl.abort_limit);

	blk_mq_free_request(req);
}

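/*
 * Request timeout handler.  Depending on how far the controller got, either
 * shut it down outright, schedule a controller reset, or send a single Abort
 * command and give the request one more timeout period.
 */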
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = iod->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_command cmd;

	/*
	 * Shutdown immediately if controller times out while starting. The
	 * reset work will see the pci device disabled when it gets the forced
	 * cancellation error. All outstanding requests are completed on
	 * shutdown, so we return BLK_EH_HANDLED.
	 */
	if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
		dev_warn(dev->dev,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_shutdown(dev);
		req->errors = NVME_SC_CANCELLED;
		return BLK_EH_HANDLED;
	}

	/*
	 * Shutdown the controller immediately and schedule a reset if the
	 * command was already aborted once before and still hasn't been
	 * returned to the driver, or if this is the admin queue.
	 */
	if (!nvmeq->qid || iod->aborted) {
		dev_warn(dev->dev,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
		nvme_dev_shutdown(dev);
		queue_work(nvme_workq, &dev->reset_work);

		/*
		 * Mark the request as handled, since the inline shutdown
		 * forces all outstanding requests to complete.
		 */
		req->errors = NVME_SC_CANCELLED;
		return BLK_EH_HANDLED;
	}

	iod->aborted = 1;

	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);

	dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n",
				 req->tag, nvmeq->qid);

	abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd,
			BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_req)) {
		atomic_inc(&dev->ctrl.abort_limit);
		return BLK_EH_RESET_TIMER;
	}

	abort_req->timeout = ADMIN_TIMEOUT;
	abort_req->end_io_data = NULL;
	blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
	struct nvme_queue *nvmeq = data;
	int status;

	if (!blk_mq_request_started(req))
		return;

	dev_warn(nvmeq->q_dmadev,
		 "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid);

	status = NVME_SC_CANCELLED;
	if (blk_queue_dying(req->q))
		status |= NVME_SC_DNR;
	blk_mq_complete_request(req, status);
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (nvmeq->sq_cmds)
		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];
		dev->queue_count--;
		dev->queues[i] = NULL;
		nvme_free_queue(nvmeq);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq - queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	int vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->cq_vector == -1) {
		spin_unlock_irq(&nvmeq->q_lock);
		return 1;
	}
	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
	nvmeq->dev->online_queues--;
	nvmeq->cq_vector = -1;
	spin_unlock_irq(&nvmeq->q_lock);

	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_freeze_queue_start(nvmeq->dev->ctrl.admin_q);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->tags && *nvmeq->tags)
		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	if (!nvmeq)
		return;
	if (nvme_suspend_queue(nvmeq))
		return;

	/* Don't tell the adapter to delete the admin queue.
	 * Don't tell a removed adapter to delete IO queues. */
	if (qid && readl(dev->bar + NVME_REG_CSTS) != -1) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

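/*
 * Work out how deep the I/O queues can be if their submission queues are
 * carved out of the controller memory buffer (CMB) instead of host memory.
 */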
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  dev->ctrl.page_size);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
		mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

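/* Place the SQ in the CMB when allowed, otherwise fall back to host memory. */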
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
		unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
						      dev->ctrl.page_size);
		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
		nvmeq->sq_cmds_io = dev->cmb + offset;
	} else {
		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
		if (!nvmeq->sq_cmds)
			return -ENOMEM;
	}

	return 0;
}

static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth)
{
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
					  &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->q_dmadev = dev->dev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->ctrl.instance, qid);
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	nvmeq->cq_vector = -1;
	dev->queues[qid] = nvmeq;

	/* make sure queue descriptor is set before queue count, for kthread */
	mb();
	dev->queue_count++;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;
	spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	nvmeq->cq_vector = qid - 1;
	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result < 0)
		goto release_sq;

	nvme_init_queue(nvmeq, qid);
	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_complete_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_admin_init_hctx,
	.exit_hctx      = nvme_admin_exit_hctx,
	.init_request	= nvme_admin_init_request,
	.timeout	= nvme_timeout,
};

static struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.complete	= nvme_complete_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
		dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unfreeze_queue(dev->ctrl.admin_q);

	return 0;
}

static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
	struct nvme_queue *nvmeq;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
						NVME_CAP_NSSRC(cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl, cap);
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl, cap);
	if (result)
		goto free_nvmeq;

	nvmeq->cq_vector = 0;
	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result) {
		nvmeq->cq_vector = -1;
		goto free_nvmeq;
	}

	return result;

 free_nvmeq:
	nvme_free_queues(dev, 0);
	return result;
}

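/*
 * Background thread: watches every registered device for a fatal controller
 * status (or a subsystem reset request), schedules reset work when needed,
 * and otherwise polls the completion queues and resubmits async event
 * requests.
 */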
static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			u32 csts = readl(dev->bar + NVME_REG_CSTS);

			/*
			 * Skip controllers currently under reset.
			 */
			if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work))
				continue;

			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
							csts & NVME_CSTS_CFS) {
				if (queue_work(nvme_workq, &dev->reset_work)) {
					dev_warn(dev->dev,
						"Failed status: %x, reset controller\n",
						readl(dev->bar + NVME_REG_CSTS));
				}
				continue;
			}
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				nvme_process_cq(nvmeq);

				while (i == 0 && dev->ctrl.event_limit > 0)
					nvme_submit_async_event(dev);
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

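/*
 * Allocate and create as many I/O queues as the controller and the number of
 * interrupt vectors allow; partial success is not treated as an error.
 */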
static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i;
	int ret = 0;

	for (i = dev->queue_count; i <= dev->max_qid; i++) {
		if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	for (i = dev->online_queues; i <= dev->queue_count - 1; i++) {
		ret = nvme_create_queue(dev->queues[i], i);
		if (ret) {
			nvme_free_queues(dev, i);
			break;
		}
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands.  This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}

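/*
 * Map the controller memory buffer (CMB) advertised in CMBSZ/CMBLOC so that
 * submission queues can be placed in it.  Returns NULL if there is no usable
 * CMB or the mapping fails.
 */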
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
{
	u64 szu, size, offset;
	u32 cmbloc;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	void __iomem *cmb;
	dma_addr_t dma_addr;

	if (!use_cmb_sqes)
		return NULL;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!(NVME_CMB_SZ(dev->cmbsz)))
		return NULL;

	cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
	size = szu * NVME_CMB_SZ(dev->cmbsz);
	offset = szu * NVME_CMB_OFST(cmbloc);
	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));

	if (offset > bar_size)
		return NULL;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
	cmb = ioremap_wc(dma_addr, size);
	if (!cmb)
		return NULL;

	dev->cmb_dma_addr = dma_addr;
	dev->cmb_size = size;
	return cmb;
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb) {
		iounmap(dev->cmb);
		dev->cmb = NULL;
	}
}

static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

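/*
 * Discover how many I/O queues the controller supports, resize the doorbell
 * BAR mapping if necessary, allocate MSI-X/MSI vectors and then create the
 * queues themselves.
 */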
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, i, vecs, nr_io_queues, size;

	nr_io_queues = num_possible_cpus();
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	/*
	 * Degraded controllers might return an error when setting the queue
	 * count.  We still want to be able to bring them online and offer
	 * access to the admin queue, as that might be only way to fix them up.
	 */
	if (result > 0) {
		dev_err(dev->dev, "Could not set queue count (%d)\n", result);
		nr_io_queues = 0;
		result = 0;
	}

	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			nvme_release_cmb(dev);
	}

	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {
		iounmap(dev->bar);
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
			if (dev->bar)
				break;
			if (!--nr_io_queues)
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);
		} while (1);
		dev->dbs = dev->bar + 4096;
		adminq->q_db = dev->dbs;
	}

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, adminq);

	/*
	 * If we enable msix early due to not intx, disable it again before
	 * setting up the full range we need.
	 */
	if (!pdev->irq)
		pci_disable_msix(pdev);

	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
	if (vecs < 0) {
		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
		if (vecs < 0) {
			vecs = 1;
		} else {
			for (i = 0; i < vecs; i++)
				dev->entry[i].vector = i + pdev->irq;
		}
	}

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	nr_io_queues = vecs;
	dev->max_qid = nr_io_queues;

	result = queue_request_irq(dev, adminq, adminq->irqname);
	if (result) {
		adminq->cq_vector = -1;
		goto free_queues;
	}

	/* Free previously allocated queues that are no longer usable */
	nvme_free_queues(dev, nr_io_queues + 1);
	return nvme_create_io_queues(dev);

 free_queues:
	nvme_free_queues(dev, 1);
	return result;
}

static void nvme_set_irq_hints(struct nvme_dev *dev)
{
	struct nvme_queue *nvmeq;
	int i;

	for (i = 0; i < dev->online_queues; i++) {
		nvmeq = dev->queues[i];

		if (!nvmeq->tags || !(*nvmeq->tags))
			continue;

		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
					blk_mq_tags_cpumask(*nvmeq->tags));
	}
}

static void nvme_dev_scan(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);

	if (!dev->tagset.tags)
		return;
	nvme_scan_namespaces(&dev->ctrl);
	nvme_set_irq_hints(dev);
}

/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Device.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	if (!dev->ctrl.tagset) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_cmd_size(dev);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->tagset))
			return 0;
		dev->ctrl.tagset = &dev->tagset;
	}
	queue_work(nvme_workq, &dev->scan_work);
	return 0;
}

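/*
 * Enable the PCI device, map BAR 0 (registers and doorbells) and read the
 * basic controller parameters (queue depth, doorbell stride, CMB).
 */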
static int nvme_dev_map(struct nvme_dev *dev)
{
	u64 cap;
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (!bars)
		goto disable_pci;

	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto unmap;
	}

	/*
	 * Some devices don't advertse INTx interrupts, pre-enable a single
	 * MSIX vec for setup. We'll adjust this later.
	 */
	if (!pdev->irq) {
		result = pci_enable_msix(pdev, dev->entry, 1);
		if (result < 0)
			goto unmap;
	}

1665 1666
	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

K
Keith Busch 已提交
1667 1668
	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
1669 1670
	dev->dbs = dev->bar + 4096;
	if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
1671
		dev->cmb = nvme_map_cmb(dev);
1672 1673 1674

	return 0;

K
Keith Busch 已提交
1675 1676 1677
 unmap:
	iounmap(dev->bar);
	dev->bar = NULL;
1678 1679 1680 1681 1682 1683 1684 1685 1686
 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}

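/*
 * Undo nvme_dev_map(): tear down MSI/MSI-X, unmap the BAR, release the PCI
 * regions and disable the device if it is still enabled.
 */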
static void nvme_dev_unmap(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pdev->msi_enabled)
		pci_disable_msi(pdev);
	else if (pdev->msix_enabled)
		pci_disable_msix(pdev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
		pci_release_regions(pdev);
	}

	if (pci_is_enabled(pdev))
		pci_disable_device(pdev);
}

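/*
 * Tracking for asynchronous I/O queue deletion: a refcount of outstanding
 * delete commands and the task waiting for them all to finish.
 */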
struct nvme_delq_ctx {
	struct task_struct *waiter;
	struct kthread_worker *worker;
	atomic_t refcount;
};

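/*
 * Wait for all outstanding queue deletions to complete.  If we time out or
 * receive a fatal signal, force the controller down so the worker can drain.
 */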
static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
{
	dq->waiter = current;
	mb();

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!atomic_read(&dq->refcount))
			break;
		if (!schedule_timeout(ADMIN_TIMEOUT) ||
					fatal_signal_pending(current)) {
			/*
			 * Disable the controller first since we can't trust it
			 * at this point, but leave the admin queue enabled
			 * until all queue deletion requests are flushed.
			 * FIXME: This may take a while if there are more h/w
			 * queues than admin tags.
			 */
			set_current_state(TASK_RUNNING);
			nvme_disable_ctrl(&dev->ctrl,
				lo_hi_readq(dev->bar + NVME_REG_CAP));
			nvme_clear_queue(dev->queues[0]);
			flush_kthread_worker(dq->worker);
			nvme_disable_queue(dev, 0);
			return;
		}
	}
	set_current_state(TASK_RUNNING);
}

static void nvme_put_dq(struct nvme_delq_ctx *dq)
{
	atomic_dec(&dq->refcount);
	if (dq->waiter)
		wake_up_process(dq->waiter);
}

static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
{
	atomic_inc(&dq->refcount);
	return dq;
}

static void nvme_del_queue_end(struct nvme_queue *nvmeq)
{
	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
	nvme_put_dq(dq);

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

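/*
 * Build and issue an asynchronous delete-SQ/CQ admin command for @nvmeq;
 * completion is handled through the queue's cmdinfo kthread work (@fn).
 */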
static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
						kthread_work_func_t fn)
{
	struct request *req;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	init_kthread_work(&nvmeq->cmdinfo.work, fn);

	req = nvme_alloc_request(nvmeq->dev->ctrl.admin_q, &c, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = ADMIN_TIMEOUT;
	req->end_io_data = &nvmeq->cmdinfo;
	blk_execute_rq_nowait(req->q, NULL, req, 0, async_cmd_info_endio);
	return 0;
}

static void nvme_del_cq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	nvme_del_queue_end(nvmeq);
}

static int nvme_delete_cq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
						nvme_del_cq_work_handler);
}

static void nvme_del_sq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	int status = nvmeq->cmdinfo.status;

	if (!status)
		status = nvme_delete_cq(nvmeq);
	if (status)
		nvme_del_queue_end(nvmeq);
}

static int nvme_delete_sq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
						nvme_del_sq_work_handler);
}

static void nvme_del_queue_start(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	if (nvme_delete_sq(nvmeq))
		nvme_del_queue_end(nvmeq);
}

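/*
 * Delete all I/O queues asynchronously via a temporary kthread worker, then
 * wait for the deletions to finish (or time out) before tearing down.
 */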
static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	int i;
	DEFINE_KTHREAD_WORKER_ONSTACK(worker);
	struct nvme_delq_ctx dq;
	struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
					&worker, "nvme%d", dev->ctrl.instance);

	if (IS_ERR(kworker_task)) {
		dev_err(dev->dev,
			"Failed to create queue del task\n");
		for (i = dev->queue_count - 1; i > 0; i--)
			nvme_disable_queue(dev, i);
		return;
	}

	dq.waiter = NULL;
	atomic_set(&dq.refcount, 0);
	dq.worker = &worker;
	for (i = dev->queue_count - 1; i > 0; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];

		if (nvme_suspend_queue(nvmeq))
			continue;
		nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
		nvmeq->cmdinfo.worker = dq.worker;
		init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
		queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
	}
	nvme_wait_dq(&dq, dev);
	kthread_stop(kworker_task);
}

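/*
 * Add the device to the global polling list, starting the shared nvme
 * kthread if this is the first device.
 */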
static int nvme_dev_list_add(struct nvme_dev *dev)
{
	bool start_thread = false;

	spin_lock(&dev_list_lock);
	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
		start_thread = true;
		nvme_thread = NULL;
	}
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	if (start_thread) {
		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
		wake_up_all(&nvme_kthread_wait);
	} else
		wait_event_killable(nvme_kthread_wait, nvme_thread);

	if (IS_ERR_OR_NULL(nvme_thread))
		return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;

	return 0;
}

/*
 * Remove the node from the device list and check whether we need to stop
 * the nvme_thread.
 */
static void nvme_dev_list_remove(struct nvme_dev *dev)
{
	struct task_struct *tmp = NULL;

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
		tmp = nvme_thread;
		nvme_thread = NULL;
	}
	spin_unlock(&dev_list_lock);

	if (tmp)
		kthread_stop(tmp);
}

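/*
 * Freeze and stop all namespace queues so no new I/O reaches the hardware
 * while the controller is being shut down or reset.
 */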
static void nvme_freeze_queues(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->ctrl.namespaces, list) {
		blk_mq_freeze_queue_start(ns->queue);

		spin_lock_irq(ns->queue->queue_lock);
		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
		spin_unlock_irq(ns->queue->queue_lock);

		blk_mq_cancel_requeue_work(ns->queue);
		blk_mq_stop_hw_queues(ns->queue);
	}
}

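/*
 * Undo nvme_freeze_queues(): restart the namespace queues and re-kick any
 * requeued requests.
 */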
static void nvme_unfreeze_queues(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->ctrl.namespaces, list) {
		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
		blk_mq_unfreeze_queue(ns->queue);
		blk_mq_start_stopped_hw_queues(ns->queue, true);
		blk_mq_kick_requeue_list(ns->queue);
	}
}

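/*
 * Quiesce and disable the controller: remove it from the polling list,
 * freeze the namespace queues, delete or suspend the I/O queues depending on
 * the controller state, and finally unmap the device.
 */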
static void nvme_dev_shutdown(struct nvme_dev *dev)
{
	int i;
	u32 csts = -1;

	nvme_dev_list_remove(dev);

	mutex_lock(&dev->shutdown_lock);
	if (dev->bar) {
		nvme_freeze_queues(dev);
		csts = readl(dev->bar + NVME_REG_CSTS);
	}
	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
		for (i = dev->queue_count - 1; i >= 0; i--) {
			struct nvme_queue *nvmeq = dev->queues[i];
			nvme_suspend_queue(nvmeq);
		}
	} else {
		nvme_disable_io_queues(dev);
		nvme_shutdown_ctrl(&dev->ctrl);
		nvme_disable_queue(dev, 0);
	}
	nvme_dev_unmap(dev);

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_clear_queue(dev->queues[i]);
	mutex_unlock(&dev->shutdown_lock);
}

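/*
 * Create the two DMA pools used for PRP lists: full pages for large
 * transfers and 256-byte chunks as an optimisation for small I/O.
 */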
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

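/*
 * Final teardown once the last controller reference is dropped: release the
 * tag set, the admin queue and the per-device allocations.
 */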
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	put_device(dev->dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

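/*
 * Controller (re)initialisation, run from the nvme workqueue: shut down a
 * live controller if needed, then bring up the admin queue, the I/O queues
 * and the namespaces again.  On failure the controller is removed.
 */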
static void nvme_reset_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
	int result;

	if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
		goto out;

	/*
	 * If we're called to reset a live controller, first shut it down
	 * before moving on.
	 */
	if (dev->bar)
		nvme_dev_shutdown(dev);

	set_bit(NVME_CTRL_RESETTING, &dev->flags);

	result = nvme_dev_map(dev);
	if (result)
		goto out;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;

	nvme_init_queue(dev->queues[0], 0);
	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto disable;

	result = nvme_init_identify(&dev->ctrl);
	if (result)
		goto free_tags;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto free_tags;

	dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;

	result = nvme_dev_list_add(dev);
	if (result)
		goto remove;

	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->dev, "IO queues not created\n");
		nvme_remove_namespaces(&dev->ctrl);
	} else {
		nvme_unfreeze_queues(dev);
		nvme_dev_add(dev);
	}

	clear_bit(NVME_CTRL_RESETTING, &dev->flags);
	return;

 remove:
	nvme_dev_list_remove(dev);
 free_tags:
	nvme_dev_remove_admin(dev);
	blk_put_queue(dev->ctrl.admin_q);
	dev->ctrl.admin_q = NULL;
	dev->queues[0]->tags = NULL;
 disable:
	nvme_disable_queue(dev, 0);
 unmap:
	nvme_dev_unmap(dev);
 out:
	nvme_remove_dead_ctrl(dev);
}

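/*
 * Remove a controller that failed to (re)initialise: detach the PCI device
 * and drop the reference taken by nvme_remove_dead_ctrl().
 */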
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		pci_stop_and_remove_bus_device_locked(pdev);
	nvme_put_ctrl(&dev->ctrl);
}

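/*
 * Called when probe or reset fails: warn, take an extra reference and hand
 * the actual teardown off to the remove_work item.
 */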
static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
	dev_warn(dev->dev, "Removing after probe failure\n");
	kref_get(&dev->ctrl.kref);
	if (!schedule_work(&dev->remove_work))
		nvme_put_ctrl(&dev->ctrl);
}

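/*
 * Queue a controller reset and wait for it to finish.  Fails if the admin
 * queue is gone or a reset is already pending.
 */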
static int nvme_reset(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
		return -ENODEV;

	if (!queue_work(nvme_workq, &dev->reset_work))
		return -EBUSY;

	flush_work(&dev->reset_work);
	return 0;
}

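/*
 * Register access and status callbacks wired into nvme_pci_ctrl_ops for the
 * core NVMe layer.
 */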
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	return !dev->bar || dev->online_queues < 2;
}

static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl)
{
	return nvme_reset(to_nvme_dev(ctrl));
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.reg_read64		= nvme_pci_reg_read64,
	.io_incapable		= nvme_pci_io_incapable,
	.reset_ctrl		= nvme_pci_reset_ctrl,
	.free_ctrl		= nvme_pci_free_ctrl,
};

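/*
 * PCI probe: allocate the per-device state, register with the core NVMe
 * layer and kick off the initial reset to bring the controller up.
 */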
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, 0);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;
	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
							GFP_KERNEL, node);
	if (!dev->entry)
		goto free;
	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
							GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	INIT_LIST_HEAD(&dev->node);
	INIT_WORK(&dev->scan_work, nvme_dev_scan);
	INIT_WORK(&dev->reset_work, nvme_reset_work);
	INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
	mutex_init(&dev->shutdown_lock);

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto put_pci;

	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
			id->driver_data);
	if (result)
		goto release_pools;

	queue_work(nvme_workq, &dev->reset_work);
	return 0;

 release_pools:
	nvme_release_prp_pools(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

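/*
 * PCI reset notification: quiesce the controller before the reset and
 * schedule re-initialisation afterwards.
 */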
static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	if (prepare)
		nvme_dev_shutdown(dev);
	else
		queue_work(nvme_workq, &dev->reset_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_shutdown(dev);
}

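/*
 * PCI remove: stop the reset and scan work, unregister the namespaces and
 * release every resource acquired in nvme_probe().
 */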
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	pci_set_drvdata(pdev, NULL);
	flush_work(&dev->reset_work);
	flush_work(&dev->scan_work);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_uninit_ctrl(&dev->ctrl);
	nvme_dev_shutdown(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
	nvme_release_cmb(dev);
	nvme_release_prp_pools(dev);
	nvme_put_ctrl(&dev->ctrl);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL

#ifdef CONFIG_PM_SLEEP
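/*
 * System sleep support: a full controller shutdown on suspend and a reset to
 * bring everything back on resume.
 */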
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_shutdown(ndev);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	queue_work(nvme_workq, &ndev->reset_work);
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_notify	= nvme_reset_notify,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

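/* PCI IDs handled by this driver, including per-device quirks. */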
static const struct pci_device_id nvme_id_table[] = {
	{ PCI_VDEVICE(INTEL, 0x0953),
		.driver_data = NVME_QUIRK_STRIPE_SIZE, },
	{ PCI_VDEVICE(INTEL, 0x5845),	/* Qemu emulated controller */
		.driver_data = NVME_QUIRK_IDENTIFY_CNS, },
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.err_handler	= &nvme_err_handler,
};

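/*
 * Module init: create the shared workqueue, initialise the core and register
 * the PCI driver.
 */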
static int __init nvme_init(void)
{
	int result;

	init_waitqueue_head(&nvme_kthread_wait);

	nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0);
	if (!nvme_workq)
		return -ENOMEM;

	result = nvme_core_init();
	if (result < 0)
		goto kill_workq;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto core_exit;
	return 0;

 core_exit:
	nvme_core_exit();
 kill_workq:
	destroy_workqueue(nvme_workq);
	return result;
}

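/* Module exit: unregister the driver and tear down what nvme_init() set up. */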
static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	nvme_core_exit();
	destroy_workqueue(nvme_workq);
	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
	_nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);