/*
 * NVM Express device driver
 * Copyright (c) 2011-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/list_sort.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
#include <linux/pr.h>
#include <scsi/sg.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <asm/unaligned.h>

#include <uapi/linux/nvme_ioctl.h>
#include "nvme.h"

#define NVME_MINORS		(1U << MINORBITS)
#define NVME_Q_DEPTH		1024
#define NVME_AQ_DEPTH		256
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))

unsigned char admin_timeout = 60;
module_param(admin_timeout, byte, 0644);
MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");

unsigned char nvme_io_timeout = 30;
module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");

unsigned char shutdown_timeout = 5;
module_param(shutdown_timeout, byte, 0644);
MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");

static int nvme_major;
module_param(nvme_major, int, 0);

static int nvme_char_major;
module_param(nvme_char_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static bool use_cmb_sqes = true;
module_param(use_cmb_sqes, bool, 0644);
MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");

static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;
static wait_queue_head_t nvme_kthread_wait;

static struct class *nvme_class;

struct nvme_dev;
struct nvme_queue;
struct nvme_iod;

static int __nvme_reset(struct nvme_dev *dev);
static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod);
static void nvme_dead_ctrl(struct nvme_dev *dev);

struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	struct request *req;
	u32 result;
	int status;
	void *ctx;
};

/*
 * Represents an NVM Express device.  Each nvme_dev is a PCI function.
 */
struct nvme_dev {
	struct list_head node;
	struct nvme_queue **queues;
	struct blk_mq_tag_set tagset;
	struct blk_mq_tag_set admin_tagset;
	u32 __iomem *dbs;
	struct device *dev;
	struct dma_pool *prp_page_pool;
	struct dma_pool *prp_small_pool;
	unsigned queue_count;
	unsigned online_queues;
	unsigned max_qid;
	int q_depth;
	u32 db_stride;
	struct msix_entry *entry;
	void __iomem *bar;
	struct list_head namespaces;
	struct device *device;
	struct work_struct reset_work;
	struct work_struct probe_work;
	struct work_struct scan_work;
	bool subsystem;
	u32 max_hw_sectors;
	u32 stripe_size;
	void __iomem *cmb;
	dma_addr_t cmb_dma_addr;
	u64 cmb_size;
	u32 cmbsz;

	struct nvme_ctrl ctrl;
};

static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
{
	return container_of(ctrl, struct nvme_dev, ctrl);
}

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	struct nvme_command __iomem *sq_cmds_io;
	volatile struct nvme_completion *cqes;
	struct blk_mq_tags **tags;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	u32 __iomem *q_db;
	u16 q_depth;
	s16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	struct async_cmd_info cmdinfo;
};

/*
 * The nvme_iod describes the data in an I/O, including the list of PRP
 * entries.  You can't see it in this data structure because C doesn't let
 * me express that.  Use nvme_alloc_iod to ensure there's enough space
 * allocated to store the PRP list.
 */
struct nvme_iod {
	unsigned long private;	/* For the use of the submitter of the I/O */
	int npages;		/* In the PRP list. 0 means small pool in use */
	int offset;		/* Of PRP list */
	int nents;		/* Used in scatterlist */
	int length;		/* Of data, in bytes */
	dma_addr_t first_dma;
	struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */
	struct scatterlist sg[0];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	int aborted;
	struct nvme_queue *nvmeq;
	struct nvme_iod iod[0];
};

/*
 * Max size of iod being embedded in the request payload
 */
#define NVME_INT_PAGES		2
#define NVME_INT_BYTES(dev)	(NVME_INT_PAGES * (dev)->ctrl.page_size)
#define NVME_INT_MASK		0x01

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size, struct nvme_dev *dev)
{
	unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
				      dev->ctrl.page_size);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static unsigned int nvme_cmd_size(struct nvme_dev *dev)
{
	unsigned int ret = sizeof(struct nvme_cmd_info);

	ret += sizeof(struct nvme_iod);
	ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
	ret += sizeof(struct scatterlist) * NVME_INT_PAGES;

	return ret;
}

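/*
 * Admin queue blk-mq hardware context init: bind hctx 0 to the
 * pre-allocated admin nvme_queue and record its tag set so completions
 * can be mapped back to requests.
 */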
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
				unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[0];

	WARN_ON(hctx_idx != 0);
	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
	WARN_ON(nvmeq->tags);

	hctx->driver_data = nvmeq;
	nvmeq->tags = &dev->admin_tagset.tags[0];
	return 0;
}

static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	nvmeq->tags = NULL;
}

static int nvme_admin_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[0];

	BUG_ON(!nvmeq);
	cmd->nvmeq = nvmeq;
	return 0;
}

static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
			  unsigned int hctx_idx)
{
	struct nvme_dev *dev = data;
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	if (!nvmeq->tags)
		nvmeq->tags = &dev->tagset.tags[hctx_idx];

	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
	hctx->driver_data = nvmeq;
	return 0;
}

static int nvme_init_request(void *data, struct request *req,
				unsigned int hctx_idx, unsigned int rq_idx,
				unsigned int numa_node)
{
	struct nvme_dev *dev = data;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

	BUG_ON(!nvmeq);
	cmd->nvmeq = nvmeq;
	return 0;
}

static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
				nvme_completion_fn handler)
{
	cmd->fn = handler;
	cmd->ctx = ctx;
	cmd->aborted = 0;
	blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
}

static void *iod_get_private(struct nvme_iod *iod)
{
	return (void *) (iod->private & ~0x1UL);
}

/*
 * If bit 0 is set, the iod is embedded in the request payload.
 */
static bool iod_should_kfree(struct nvme_iod *iod)
{
	return (iod->private & NVME_INT_MASK) == 0;
}

/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)

static void special_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(nvmeq->q_dmadev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(nvmeq->q_dmadev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
}

static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
{
	void *ctx;

	if (fn)
		*fn = cmd->fn;
	ctx = cmd->ctx;
	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_CANCELLED;
	return ctx;
}

static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	u32 result = le32_to_cpup(&cqe->result);
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
		++nvmeq->dev->ctrl.event_limit;
	if (status != NVME_SC_SUCCESS)
		return;

	switch (result & 0xff07) {
	case NVME_AER_NOTICE_NS_CHANGED:
		dev_info(nvmeq->q_dmadev, "rescanning\n");
		schedule_work(&nvmeq->dev->scan_work);
	default:
		dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result);
	}
}

static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct request *req = ctx;

	u16 status = le16_to_cpup(&cqe->status) >> 1;
	u32 result = le32_to_cpup(&cqe->result);

	blk_mq_free_request(req);

	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
	++nvmeq->dev->ctrl.abort_limit;
}

static void async_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct async_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
	blk_mq_free_request(cmdinfo->req);
}

static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
				  unsigned int tag)
{
	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);

	return blk_mq_rq_to_pdu(req);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
						nvme_completion_fn *fn)
{
	struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
	void *ctx;
	if (tag >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = cmd->fn;
	ctx = cmd->ctx;
	cmd->fn = special_completion;
	cmd->ctx = CMD_CTX_COMPLETED;
	return ctx;
}

/**
 * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
						struct nvme_command *cmd)
{
	u16 tail = nvmeq->sq_tail;

	if (nvmeq->sq_cmds_io)
		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
	else
		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
}

static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	__nvme_submit_cmd(nvmeq, cmd);
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);
}

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
			    unsigned nseg, unsigned long private)
{
	iod->private = private;
	iod->offset = offsetof(struct nvme_iod, sg[nseg]);
	iod->npages = -1;
	iod->length = nbytes;
	iod->nents = 0;
}

static struct nvme_iod *
__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
		 unsigned long priv, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(bytes, dev) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod)
		iod_init(iod, bytes, nseg, priv);

	return iod;
}

static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
			               gfp_t gfp)
{
	unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
                                                sizeof(struct nvme_dsm_range);
	struct nvme_iod *iod;

	if (rq->nr_phys_segments <= NVME_INT_PAGES &&
	    size <= NVME_INT_BYTES(dev)) {
		struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);

		iod = cmd->iod;
		iod_init(iod, size, rq->nr_phys_segments,
				(unsigned long) rq | NVME_INT_MASK);
		return iod;
	}

	return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
				(unsigned long) rq, gfp);
}

static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = dev->ctrl.page_size / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}

	if (iod_should_kfree(iod))
		kfree(iod);
}

#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == v)
		pi->ref_tag = cpu_to_be32(p);
}

static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
	if (be32_to_cpu(pi->ref_tag) == p)
		pi->ref_tag = cpu_to_be32(v);
}

/**
 * nvme_dif_remap - remaps ref tags to bip seed and physical lba
 *
 * The virtual start sector is the one that was originally submitted by the
 * block layer.	Due to partitioning, MD/DM cloning, etc. the actual physical
 * start sector may be different. Remap protection information to match the
 * physical LBA on writes, and back to the original seed on reads.
 *
 * Type 0 and 3 do not have a ref tag, so no remapping required.
 */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
	struct nvme_ns *ns = req->rq_disk->private_data;
	struct bio_integrity_payload *bip;
	struct t10_pi_tuple *pi;
	void *p, *pmap;
	u32 i, nlb, ts, phys, virt;

	if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
		return;

	bip = bio_integrity(req->bio);
	if (!bip)
		return;

	pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;

	p = pmap;
	virt = bip_get_seed(bip);
	phys = nvme_block_nr(ns, blk_rq_pos(req));
	nlb = (blk_rq_bytes(req) >> ns->lba_shift);
	ts = ns->disk->queue->integrity.tuple_size;

	for (i = 0; i < nlb; i++, virt++, phys++) {
		pi = (struct t10_pi_tuple *)p;
		dif_swap(phys, virt, pi);
		p += ts;
	}
	kunmap_atomic(pmap);
}
#else /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_dif_remap(struct request *req,
			void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
{
}
static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
{
}
#endif

static void req_completion(struct nvme_queue *nvmeq, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct request *req = iod_get_private(iod);
	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
	u16 status = le16_to_cpup(&cqe->status) >> 1;
	int error = 0;

	if (unlikely(status)) {
		if (!(status & NVME_SC_DNR || blk_noretry_request(req))
		    && (jiffies - req->start_time) < req->timeout) {
			unsigned long flags;

			nvme_unmap_data(nvmeq->dev, iod);

			blk_mq_requeue_request(req);
			spin_lock_irqsave(req->q->queue_lock, flags);
			if (!blk_queue_stopped(req->q))
				blk_mq_kick_requeue_list(req->q);
			spin_unlock_irqrestore(req->q->queue_lock, flags);
			return;
		}

		if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
			if (cmd_rq->ctx == CMD_CTX_CANCELLED)
				error = -EINTR;
			else
				error = status;
		} else {
			error = nvme_error_status(status);
		}
	}

	if (req->cmd_type == REQ_TYPE_DRV_PRIV) {
		u32 result = le32_to_cpup(&cqe->result);
		req->special = (void *)(uintptr_t)result;
	}

	if (cmd_rq->aborted)
		dev_warn(nvmeq->dev->dev,
			"completing aborted command with status:%04x\n",
			error);

	nvme_unmap_data(nvmeq->dev, iod);
	blk_mq_complete_request(req, error);
}

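/*
 * Build the PRP list describing the scatterlist in iod->sg.  Returns false
 * if a PRP list page could not be allocated from the DMA pool.
 */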
static bool nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod,
		int total_len)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	u32 page_size = dev->ctrl.page_size;
	int offset = dma_addr & (page_size - 1);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	length -= (page_size - offset);
	if (length <= 0)
		return true;

	dma_len -= (page_size - offset);
	if (dma_len) {
		dma_addr += (page_size - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= page_size) {
		iod->first_dma = dma_addr;
		return true;
	}

	nprps = DIV_ROUND_UP(length, page_size);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
	if (!prp_list) {
		iod->first_dma = dma_addr;
		iod->npages = -1;
		return false;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	i = 0;
	for (;;) {
		if (i == page_size >> 3) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
			if (!prp_list)
				return false;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= page_size;
		dma_addr += page_size;
		length -= page_size;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return true;
}

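/*
 * DMA-map the request's scatterlist (and integrity metadata, if present)
 * and fill in the PRP entries of the command.
 */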
static int nvme_map_data(struct nvme_dev *dev, struct nvme_iod *iod,
		struct nvme_command *cmnd)
{
	struct request *req = iod_get_private(iod);
	struct request_queue *q = req->q;
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;
	int ret = BLK_MQ_RQ_QUEUE_ERROR;

	sg_init_table(iod->sg, req->nr_phys_segments);
	iod->nents = blk_rq_map_sg(q, req, iod->sg);
	if (!iod->nents)
		goto out;

	ret = BLK_MQ_RQ_QUEUE_BUSY;
	if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir))
		goto out;

	if (!nvme_setup_prps(dev, iod, blk_rq_bytes(req)))
		goto out_unmap;

	ret = BLK_MQ_RQ_QUEUE_ERROR;
	if (blk_integrity_rq(req)) {
		if (blk_rq_count_integrity_sg(q, req->bio) != 1)
			goto out_unmap;

		sg_init_table(iod->meta_sg, 1);
		if (blk_rq_map_integrity_sg(q, req->bio, iod->meta_sg) != 1)
			goto out_unmap;

		if (rq_data_dir(req))
			nvme_dif_remap(req, nvme_dif_prep);

		if (!dma_map_sg(dev->dev, iod->meta_sg, 1, dma_dir))
			goto out_unmap;
	}

	cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
	cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
	if (blk_integrity_rq(req))
		cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
	return BLK_MQ_RQ_QUEUE_OK;

out_unmap:
	dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
out:
	return ret;
}

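/*
 * Undo the DMA mappings done by nvme_map_data() and release the iod.
 */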
static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_iod *iod)
{
	struct request *req = iod_get_private(iod);
	enum dma_data_direction dma_dir = rq_data_dir(req) ?
			DMA_TO_DEVICE : DMA_FROM_DEVICE;

	if (iod->nents) {
		dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
		if (blk_integrity_rq(req)) {
			if (!rq_data_dir(req))
				nvme_dif_remap(req, nvme_dif_complete);
			dma_unmap_sg(dev->dev, iod->meta_sg, 1, dma_dir);
		}
	}

	nvme_free_iod(dev, iod);
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct nvme_iod *iod, struct nvme_command *cmnd)
{
	struct request *req = iod_get_private(iod);
	struct nvme_dsm_range *range;

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
						&iod->first_dma);
	if (!range)
		return BLK_MQ_RQ_QUEUE_BUSY;
	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
	range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
	return BLK_MQ_RQ_QUEUE_OK;
}

/*
 * NOTE: ns is NULL when called on the admin queue.
 */
static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
			 const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_iod *iod;
	struct nvme_command cmnd;
	int ret = BLK_MQ_RQ_QUEUE_OK;

	/*
	 * If formatted with metadata, require the block layer provide a buffer
	 * unless this namespace is formatted such that the metadata can be
	 * stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
					req->cmd_type != REQ_TYPE_DRV_PRIV) {
			blk_mq_complete_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	iod = nvme_alloc_iod(req, dev, GFP_ATOMIC);
	if (!iod)
		return BLK_MQ_RQ_QUEUE_BUSY;

	if (req->cmd_flags & REQ_DISCARD) {
		ret = nvme_setup_discard(nvmeq, ns, iod, &cmnd);
	} else {
		if (req->cmd_type == REQ_TYPE_DRV_PRIV)
			memcpy(&cmnd, req->cmd, sizeof(cmnd));
		else if (req->cmd_flags & REQ_FLUSH)
			nvme_setup_flush(ns, &cmnd);
		else
			nvme_setup_rw(ns, req, &cmnd);

		if (req->nr_phys_segments)
			ret = nvme_map_data(dev, iod, &cmnd);
	}

	if (ret)
		goto out;

	cmnd.common.command_id = req->tag;
	nvme_set_info(cmd, iod, req_completion);

	spin_lock_irq(&nvmeq->q_lock);
	__nvme_submit_cmd(nvmeq, &cmnd);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;
out:
	nvme_free_iod(dev, iod);
	return ret;
}

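/*
 * Reap completion queue entries until the phase tag no longer matches,
 * completing each command and then ringing the CQ head doorbell.
 * Called with q_lock held.
 */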
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}
		if (tag && *tag == cqe.command_id)
			*tag = -1;
		ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
		fn(nvmeq, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return;

	if (likely(nvmeq->cq_vector >= 0))
		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
}

static void nvme_process_cq(struct nvme_queue *nvmeq)
{
	__nvme_process_cq(nvmeq, NULL);
}

static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

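/*
 * blk-mq poll callback: reap completions without an interrupt and report
 * whether the polled tag completed.
 */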
static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
{
	struct nvme_queue *nvmeq = hctx->driver_data;

	if ((le16_to_cpu(nvmeq->cqes[nvmeq->cq_head].status) & 1) ==
	    nvmeq->cq_phase) {
		spin_lock_irq(&nvmeq->q_lock);
		__nvme_process_cq(nvmeq, &tag);
		spin_unlock_irq(&nvmeq->q_lock);

		if (tag == -1)
			return 1;
	}

	return 0;
}

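/*
 * Post an Asynchronous Event Request on the admin queue using one of the
 * reserved tags.
 */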
static int nvme_submit_async_admin_req(struct nvme_dev *dev)
{
	struct nvme_queue *nvmeq = dev->queues[0];
	struct nvme_command c;
	struct nvme_cmd_info *cmd_info;
	struct request *req;

	req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE,
			BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->cmd_flags |= REQ_NO_TIMEOUT;
	cmd_info = blk_mq_rq_to_pdu(req);
	nvme_set_info(cmd_info, NULL, async_req_completion);

	memset(&c, 0, sizeof(c));
	c.common.opcode = nvme_admin_async_event;
	c.common.command_id = req->tag;

	blk_mq_free_request(req);
	__nvme_submit_cmd(nvmeq, &c);
	return 0;
}

static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
			struct nvme_command *cmd,
			struct async_cmd_info *cmdinfo, unsigned timeout)
{
	struct nvme_queue *nvmeq = dev->queues[0];
	struct request *req;
	struct nvme_cmd_info *cmd_rq;

	req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE, 0);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout;
	cmd_rq = blk_mq_rq_to_pdu(req);
	cmdinfo->req = req;
	nvme_set_info(cmd_rq, cmdinfo, async_completion);
	cmdinfo->status = -EINTR;

	cmd->common.command_id = req->tag;

	nvme_submit_cmd(nvmeq, cmd);
	return 0;
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

/**
 * nvme_abort_req - Attempt aborting a request
 *
 * Schedule controller reset if the command was already aborted once before and
 * still hasn't been returned to the driver, or if this is the admin queue.
 */
static void nvme_abort_req(struct request *req)
{
	struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = cmd_rq->nvmeq;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *abort_req;
	struct nvme_cmd_info *abort_cmd;
	struct nvme_command cmd;

	if (!nvmeq->qid || cmd_rq->aborted) {
		spin_lock(&dev_list_lock);
		if (!__nvme_reset(dev)) {
			dev_warn(dev->dev,
				 "I/O %d QID %d timeout, reset controller\n",
				 req->tag, nvmeq->qid);
		}
		spin_unlock(&dev_list_lock);
		return;
	}

	if (!dev->ctrl.abort_limit)
		return;

	abort_req = blk_mq_alloc_request(dev->ctrl.admin_q, WRITE,
			BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(abort_req))
		return;

	abort_cmd = blk_mq_rq_to_pdu(abort_req);
	nvme_set_info(abort_cmd, abort_req, abort_completion);

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = req->tag;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
	cmd.abort.command_id = abort_req->tag;

	--dev->ctrl.abort_limit;
	cmd_rq->aborted = 1;

	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
							nvmeq->qid);
	nvme_submit_cmd(dev->queues[0], &cmd);
}

static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
{
	struct nvme_queue *nvmeq = data;
	void *ctx;
	nvme_completion_fn fn;
	struct nvme_cmd_info *cmd;
	struct nvme_completion cqe;

	if (!blk_mq_request_started(req))
		return;

	cmd = blk_mq_rq_to_pdu(req);

	if (cmd->ctx == CMD_CTX_CANCELLED)
		return;

	if (blk_queue_dying(req->q))
		cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
	else
		cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);


	dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
						req->tag, nvmeq->qid);
	ctx = cancel_cmd_info(cmd, &fn);
	fn(nvmeq, ctx, &cqe);
}

static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
	struct nvme_queue *nvmeq = cmd->nvmeq;

	dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
							nvmeq->qid);
	spin_lock_irq(&nvmeq->q_lock);
	nvme_abort_req(req);
	spin_unlock_irq(&nvmeq->q_lock);

	/*
	 * The aborted req will be completed on receiving the abort req.
	 * We enable the timer again. If hit twice, it'll cause a device reset,
	 * as the device then is in a faulty state.
	 */
	return BLK_EH_RESET_TIMER;
}

static void nvme_free_queue(struct nvme_queue *nvmeq)
{
	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	if (nvmeq->sq_cmds)
		dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];
		dev->queue_count--;
		dev->queues[i] = NULL;
		nvme_free_queue(nvmeq);
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq - queue to suspend
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	int vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->cq_vector == -1) {
		spin_unlock_irq(&nvmeq->q_lock);
		return 1;
	}
	vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
	nvmeq->dev->online_queues--;
	nvmeq->cq_vector = -1;
	spin_unlock_irq(&nvmeq->q_lock);

	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
		blk_mq_freeze_queue_start(nvmeq->dev->ctrl.admin_q);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->tags && *nvmeq->tags)
		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = dev->queues[qid];

	if (!nvmeq)
		return;
	if (nvme_suspend_queue(nvmeq))
		return;

	/* Don't tell the adapter to delete the admin queue.
	 * Don't tell a removed adapter to delete IO queues. */
	if (qid && readl(dev->bar + NVME_REG_CSTS) != -1) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

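/*
 * Work out how deep the I/O queues can be if their SQs are to be carved
 * out of the controller memory buffer.
 */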
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
				int entry_size)
{
	int q_depth = dev->q_depth;
	unsigned q_size_aligned = roundup(q_depth * entry_size,
					  dev->ctrl.page_size);

	if (q_size_aligned * nr_io_queues > dev->cmb_size) {
		u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
		mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
		q_depth = div_u64(mem_per_q, entry_size);

		/*
		 * Ensure the reduced q_depth is above some threshold where it
		 * would be better to map queues in system memory with the
		 * original depth
		 */
		if (q_depth < 64)
			return -ENOMEM;
	}

	return q_depth;
}

static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
				int qid, int depth)
{
	if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
		unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
						      dev->ctrl.page_size);
		nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset;
		nvmeq->sq_cmds_io = dev->cmb + offset;
	} else {
		nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
		if (!nvmeq->sq_cmds)
			return -ENOMEM;
	}

	return 0;
}

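/*
 * Allocate and initialise the completion and submission queue memory for
 * one queue and register it in dev->queues[].
 */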
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth)
{
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
					  &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;

	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->q_dmadev = dev->dev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->ctrl.instance, qid);
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	nvmeq->cq_vector = -1;
	dev->queues[qid] = nvmeq;

	/* make sure queue descriptor is set before queue count, for kthread */
	mb();
	dev->queue_count++;

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;

	spin_lock_irq(&nvmeq->q_lock);
	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	dev->online_queues++;
	spin_unlock_irq(&nvmeq->q_lock);
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	nvmeq->cq_vector = qid - 1;
	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result < 0)
		goto release_sq;

	nvme_init_queue(nvmeq, qid);
	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

static struct blk_mq_ops nvme_mq_admin_ops = {
	.queue_rq	= nvme_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_admin_init_hctx,
	.exit_hctx      = nvme_admin_exit_hctx,
	.init_request	= nvme_admin_init_request,
	.timeout	= nvme_timeout,
};

static struct blk_mq_ops nvme_mq_ops = {
	.queue_rq	= nvme_queue_rq,
	.map_queue	= blk_mq_map_queue,
	.init_hctx	= nvme_init_hctx,
	.init_request	= nvme_init_request,
	.timeout	= nvme_timeout,
	.poll		= nvme_poll,
};

static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
	if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
		blk_cleanup_queue(dev->ctrl.admin_q);
		blk_mq_free_tag_set(&dev->admin_tagset);
	}
}

static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
	if (!dev->ctrl.admin_q) {
		dev->admin_tagset.ops = &nvme_mq_admin_ops;
		dev->admin_tagset.nr_hw_queues = 1;
		dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
		dev->admin_tagset.reserved_tags = 1;
		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
		dev->admin_tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
			return -ENOMEM;

		dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
		if (IS_ERR(dev->ctrl.admin_q)) {
			blk_mq_free_tag_set(&dev->admin_tagset);
			return -ENOMEM;
		}
		if (!blk_get_queue(dev->ctrl.admin_q)) {
			nvme_dev_remove_admin(dev);
			dev->ctrl.admin_q = NULL;
			return -ENODEV;
		}
	} else
		blk_mq_unfreeze_queue(dev->ctrl.admin_q);

	return 0;
}

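/*
 * Disable the controller, program the admin queue attributes (AQA/ASQ/ACQ),
 * re-enable it and request the admin interrupt.
 */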
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
	struct nvme_queue *nvmeq;

	dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ?
						NVME_CAP_NSSRC(cap) : 0;

	if (dev->subsystem &&
	    (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
		writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);

	result = nvme_disable_ctrl(&dev->ctrl, cap);
	if (result < 0)
		return result;

	nvmeq = dev->queues[0];
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	writel(aqa, dev->bar + NVME_REG_AQA);
	lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
	lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);

	result = nvme_enable_ctrl(&dev->ctrl, cap);
	if (result)
		goto free_nvmeq;

	nvmeq->cq_vector = 0;
	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result) {
		nvmeq->cq_vector = -1;
		goto free_nvmeq;
	}

	return result;

 free_nvmeq:
	nvme_free_queues(dev, 0);
	return result;
}

static int nvme_subsys_reset(struct nvme_dev *dev)
{
	if (!dev->subsystem)
		return -ENOTTY;

	writel(0x4E564D65, dev->bar + NVME_REG_NSSR); /* "NVMe" */
	return 0;
}

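/*
 * Per-driver housekeeping thread: watches for controller fatal status or
 * subsystem reset requests, drains completion queues, and resubmits
 * asynchronous event requests.
 */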
static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			u32 csts = readl(dev->bar + NVME_REG_CSTS);

			if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) ||
							csts & NVME_CSTS_CFS) {
				if (!__nvme_reset(dev)) {
					dev_warn(dev->dev,
						"Failed status: %x, reset controller\n",
						readl(dev->bar + NVME_REG_CSTS));
				}
				continue;
			}
			for (i = 0; i < dev->queue_count; i++) {
				struct nvme_queue *nvmeq = dev->queues[i];
				if (!nvmeq)
					continue;
				spin_lock_irq(&nvmeq->q_lock);
				nvme_process_cq(nvmeq);

				while (i == 0 && dev->ctrl.event_limit > 0) {
					if (nvme_submit_async_admin_req(dev))
						break;
					dev->ctrl.event_limit--;
				}
				spin_unlock_irq(&nvmeq->q_lock);
			}
		}
		spin_unlock(&dev_list_lock);
		schedule_timeout(round_jiffies_relative(HZ));
	}
	return 0;
}

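/*
 * Allocate a namespace: set up its request queue and gendisk, apply the
 * controller's transfer limits, and register the disk.
 */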
static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int node = dev_to_node(dev->dev);

	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
	if (!ns)
		return;

	ns->queue = blk_mq_init_queue(&dev->tagset);
	if (IS_ERR(ns->queue))
		goto out_free_ns;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	ns->ctrl = &dev->ctrl;
	ns->queue->queuedata = ns;

	disk = alloc_disk_node(0, node);
	if (!disk)
		goto out_free_queue;

	kref_init(&ns->kref);
	ns->ns_id = nsid;
	ns->disk = disk;
	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
	list_add_tail(&ns->list, &dev->namespaces);

	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (dev->max_hw_sectors) {
		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
		blk_queue_max_segments(ns->queue,
			(dev->max_hw_sectors / (dev->ctrl.page_size >> 9)) + 1);
	}
	if (dev->stripe_size)
		blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
	if (dev->ctrl.vwc & NVME_CTRL_VWC_PRESENT)
		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
	blk_queue_virt_boundary(ns->queue, dev->ctrl.page_size - 1);

	disk->major = nvme_major;
	disk->first_minor = 0;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = dev->device;
	disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(disk->disk_name, "nvme%dn%d", dev->ctrl.instance, nsid);

	/*
	 * Initialize capacity to 0 until we establish the namespace format and
	 * setup integrity extensions if necessary. The revalidate_disk after
	 * add_disk allows the driver to register with integrity if the format
	 * requires it.
	 */
	set_capacity(disk, 0);
	if (nvme_revalidate_disk(ns->disk))
		goto out_free_disk;

	kref_get(&dev->ctrl.kref);
	if (ns->type != NVME_NS_LIGHTNVM) {
		add_disk(ns->disk);
		if (ns->ms) {
			struct block_device *bd = bdget_disk(ns->disk, 0);
			if (!bd)
				return;
			if (blkdev_get(bd, FMODE_READ, NULL)) {
				bdput(bd);
				return;
			}
			blkdev_reread_part(bd);
			blkdev_put(bd, FMODE_READ);
		}
	}
	return;
 out_free_disk:
	kfree(disk);
	list_del(&ns->list);
 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
}

/*
 * Create I/O queues.  Failing to create an I/O queue is not an issue,
 * we can continue with less than the desired amount of queues, and
 * even a controller without I/O queues can still be used to issue
 * admin commands.  This might be useful to upgrade a buggy firmware
 * for example.
 */
static void nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i;

	for (i = dev->queue_count; i <= dev->max_qid; i++)
		if (!nvme_alloc_queue(dev, i, dev->q_depth))
			break;

	for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
		if (nvme_create_queue(dev->queues[i], i)) {
			nvme_free_queues(dev, i);
			break;
		}
}

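/*
 * Ask the controller for 'count' I/O queues via the Number of Queues
 * feature.  Both the submission and completion queue counts are encoded
 * zero-based in one dword (e.g. count = 4 gives q_count = 0x00030003).
 * The controller replies with the counts it actually allocated; the
 * usable number of queue pairs is the smaller of the two, plus one.
 */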
static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status < 0)
		return status;
	if (status > 0) {
		dev_err(dev->dev, "Could not set queue count (%d)\n", status);
		return 0;
	}
	return min(result & 0xffff, result >> 16) + 1;
}

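/*
 * Map the controller memory buffer (CMB) advertised in the CMBSZ/CMBLOC
 * registers so that submission queues can be placed in controller memory
 * instead of host DRAM.  Returns NULL if no usable CMB is available or
 * the use_cmb_sqes module parameter is off.
 */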
static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
{
	u64 szu, size, offset;
	u32 cmbloc;
	resource_size_t bar_size;
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	void __iomem *cmb;
	dma_addr_t dma_addr;

	if (!use_cmb_sqes)
		return NULL;

	dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
	if (!(NVME_CMB_SZ(dev->cmbsz)))
		return NULL;

	cmbloc = readl(dev->bar + NVME_REG_CMBLOC);

	szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
	size = szu * NVME_CMB_SZ(dev->cmbsz);
	offset = szu * NVME_CMB_OFST(cmbloc);
	bar_size = pci_resource_len(pdev, NVME_CMB_BIR(cmbloc));

	if (offset > bar_size)
		return NULL;

	/*
	 * Controllers may support a CMB size larger than their BAR,
	 * for example, due to being behind a bridge. Reduce the CMB to
	 * the reported size of the BAR.
	 */
	if (size > bar_size - offset)
		size = bar_size - offset;

	dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(cmbloc)) + offset;
	cmb = ioremap_wc(dma_addr, size);
	if (!cmb)
		return NULL;

	dev->cmb_dma_addr = dma_addr;
	dev->cmb_size = size;
	return cmb;
}

static inline void nvme_release_cmb(struct nvme_dev *dev)
{
	if (dev->cmb) {
		iounmap(dev->cmb);
		dev->cmb = NULL;
	}
}

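/*
 * Size of BAR 0 needed to cover the admin registers (4096 bytes) plus one
 * submission/completion doorbell pair (8 * db_stride bytes) for the admin
 * queue and each I/O queue.  For example, 8 I/O queues with a doorbell
 * stride of 1 need 4096 + 9 * 8 = 4168 bytes.
 */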
static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

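/*
 * Negotiate the I/O queue count with the controller, size the doorbell
 * BAR mapping and MSI/MSI-X vectors to match, then create the queues.
 * Called from the probe path once the admin queue is up.
 */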
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, i, vecs, nr_io_queues, size;

	nr_io_queues = num_possible_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result <= 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			nvme_release_cmb(dev);
	}

	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {
		iounmap(dev->bar);
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
			if (dev->bar)
				break;
			if (!--nr_io_queues)
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);
		} while (1);
		dev->dbs = dev->bar + 4096;
		adminq->q_db = dev->dbs;
	}

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, adminq);

	/*
	 * If we enabled MSI-X early because the device does not support INTx,
	 * disable it again before setting up the full range we need.
	 */
	if (!pdev->irq)
		pci_disable_msix(pdev);

	for (i = 0; i < nr_io_queues; i++)
		dev->entry[i].entry = i;
	vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
	if (vecs < 0) {
		vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
		if (vecs < 0) {
			vecs = 1;
		} else {
			for (i = 0; i < vecs; i++)
				dev->entry[i].vector = i + pdev->irq;
		}
	}

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	nr_io_queues = vecs;
	dev->max_qid = nr_io_queues;

	result = queue_request_irq(dev, adminq, adminq->irqname);
	if (result) {
		adminq->cq_vector = -1;
		goto free_queues;
	}

	/* Free previously allocated queues that are no longer usable */
	nvme_free_queues(dev, nr_io_queues + 1);
	nvme_create_io_queues(dev);

	return 0;

 free_queues:
	nvme_free_queues(dev, 1);
	return result;
}

static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);

	return nsa->ns_id - nsb->ns_id;
}

static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		if (ns->ns_id == nsid)
			return ns;
		if (ns->ns_id > nsid)
			break;
	}
	return NULL;
}

static inline bool nvme_io_incapable(struct nvme_dev *dev)
{
	return (!dev->bar ||
		readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_CFS ||
		dev->online_queues < 2);
}

static void nvme_ns_remove(struct nvme_ns *ns)
{
	bool kill = nvme_io_incapable(to_nvme_dev(ns->ctrl)) &&
			!blk_queue_dying(ns->queue);

	if (kill)
		blk_set_queue_dying(ns->queue);
	if (ns->disk->flags & GENHD_FL_UP)
		del_gendisk(ns->disk);
	if (kill || !blk_queue_dying(ns->queue)) {
		blk_mq_abort_requeue_list(ns->queue);
		blk_cleanup_queue(ns->queue);
	}
	list_del_init(&ns->list);
	nvme_put_ns(ns);
}

static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn)
{
	struct nvme_ns *ns, *next;
	unsigned i;

	for (i = 1; i <= nn; i++) {
		ns = nvme_find_ns(dev, i);
		if (ns) {
			if (revalidate_disk(ns->disk))
				nvme_ns_remove(ns);
		} else
			nvme_alloc_ns(dev, i);
	}
	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		if (ns->ns_id > nn)
			nvme_ns_remove(ns);
	}
	list_sort(NULL, &dev->namespaces, ns_cmp);
}

static void nvme_set_irq_hints(struct nvme_dev *dev)
{
	struct nvme_queue *nvmeq;
	int i;

	for (i = 0; i < dev->online_queues; i++) {
		nvmeq = dev->queues[i];

		if (!nvmeq->tags || !(*nvmeq->tags))
			continue;

		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
					blk_mq_tags_cpumask(*nvmeq->tags));
	}
}

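/*
 * Worker for dev->scan_work: re-read the namespace count from Identify
 * Controller and add, revalidate or remove namespaces to match, then
 * refresh the IRQ affinity hints.
 */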
static void nvme_dev_scan(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work);
	struct nvme_id_ctrl *ctrl;

	if (!dev->tagset.tags)
		return;
	if (nvme_identify_ctrl(&dev->ctrl, &ctrl))
		return;
	nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn));
	kfree(ctrl);
	nvme_set_irq_hints(dev);
}

/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Controller.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	int res;
	struct nvme_id_ctrl *ctrl;
	int shift = NVME_CAP_MPSMIN(lo_hi_readq(dev->bar + NVME_REG_CAP)) + 12;

	res = nvme_identify_ctrl(&dev->ctrl, &ctrl);
	if (res) {
		dev_err(dev->dev, "Identify Controller failed (%d)\n", res);
		return -EIO;
	}

	dev->ctrl.oncs = le16_to_cpup(&ctrl->oncs);
	dev->ctrl.abort_limit = ctrl->acl + 1;
	dev->ctrl.vwc = ctrl->vwc;
	memcpy(dev->ctrl.serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->ctrl.model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->ctrl.firmware_rev, ctrl->fr, sizeof(ctrl->fr));
	if (ctrl->mdts)
		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
	else
		dev->max_hw_sectors = UINT_MAX;

	if ((dev->ctrl.quirks & NVME_QUIRK_STRIPE_SIZE) && ctrl->vs[3]) {
		unsigned int max_hw_sectors;

		dev->stripe_size = 1 << (ctrl->vs[3] + shift);
		max_hw_sectors = dev->stripe_size >> (shift - 9);
		if (dev->max_hw_sectors) {
			dev->max_hw_sectors = min(max_hw_sectors,
							dev->max_hw_sectors);
		} else
			dev->max_hw_sectors = max_hw_sectors;
	}
	kfree(ctrl);

	if (!dev->tagset.tags) {
		dev->tagset.ops = &nvme_mq_ops;
		dev->tagset.nr_hw_queues = dev->online_queues - 1;
		dev->tagset.timeout = NVME_IO_TIMEOUT;
		dev->tagset.numa_node = dev_to_node(dev->dev);
		dev->tagset.queue_depth =
				min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
		dev->tagset.cmd_size = nvme_cmd_size(dev);
		dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tagset.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->tagset))
			return 0;
	}
	schedule_work(&dev->scan_work);
	return 0;
}

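/*
 * Enable the PCI device and map BAR 0, read the controller capabilities
 * to size the queues and doorbell stride, and map the controller memory
 * buffer if the controller reports NVMe 1.2 or later.
 */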
static int nvme_dev_map(struct nvme_dev *dev)
{
	u64 cap;
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (!bars)
		goto disable_pci;

	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;

	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto unmap;
	}

	/*
	 * Some devices don't advertise INTx interrupts; pre-enable a single
	 * MSI-X vector for setup. We'll adjust this later.
	 */
	if (!pdev->irq) {
		result = pci_enable_msix(pdev, dev->entry, 1);
		if (result < 0)
			goto unmap;
	}

	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);

	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
	dev->dbs = dev->bar + 4096;
	if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2))
		dev->cmb = nvme_map_cmb(dev);

	return 0;

 unmap:
	iounmap(dev->bar);
	dev->bar = NULL;
 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pdev->msi_enabled)
		pci_disable_msi(pdev);
	else if (pdev->msix_enabled)
		pci_disable_msix(pdev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
		pci_release_regions(pdev);
	}

	if (pci_is_enabled(pdev))
		pci_disable_device(pdev);
}

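/*
 * Context shared by the asynchronous I/O queue deletion machinery below:
 * a refcount of outstanding delete commands, the kthread worker that
 * issues them, and the task waiting for the last one to complete.
 */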
struct nvme_delq_ctx {
	struct task_struct *waiter;
	struct kthread_worker *worker;
	atomic_t refcount;
};

static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
{
	dq->waiter = current;
	mb();

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!atomic_read(&dq->refcount))
			break;
		if (!schedule_timeout(ADMIN_TIMEOUT) ||
					fatal_signal_pending(current)) {
			/*
			 * Disable the controller first since we can't trust it
			 * at this point, but leave the admin queue enabled
			 * until all queue deletion requests are flushed.
			 * FIXME: This may take a while if there are more h/w
			 * queues than admin tags.
			 */
			set_current_state(TASK_RUNNING);
			nvme_disable_ctrl(&dev->ctrl,
				lo_hi_readq(dev->bar + NVME_REG_CAP));
			nvme_clear_queue(dev->queues[0]);
			flush_kthread_worker(dq->worker);
			nvme_disable_queue(dev, 0);
			return;
		}
	}
	set_current_state(TASK_RUNNING);
}

static void nvme_put_dq(struct nvme_delq_ctx *dq)
{
	atomic_dec(&dq->refcount);
	if (dq->waiter)
		wake_up_process(dq->waiter);
}

static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
{
	atomic_inc(&dq->refcount);
	return dq;
}

static void nvme_del_queue_end(struct nvme_queue *nvmeq)
{
	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
	nvme_put_dq(dq);

	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
}

static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
						kthread_work_func_t fn)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	init_kthread_work(&nvmeq->cmdinfo.work, fn);
	return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
								ADMIN_TIMEOUT);
}

static void nvme_del_cq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	nvme_del_queue_end(nvmeq);
}

static int nvme_delete_cq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
						nvme_del_cq_work_handler);
}

static void nvme_del_sq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	int status = nvmeq->cmdinfo.status;

	if (!status)
		status = nvme_delete_cq(nvmeq);
	if (status)
		nvme_del_queue_end(nvmeq);
}

static int nvme_delete_sq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
						nvme_del_sq_work_handler);
}

static void nvme_del_queue_start(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	if (nvme_delete_sq(nvmeq))
		nvme_del_queue_end(nvmeq);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	int i;
	DEFINE_KTHREAD_WORKER_ONSTACK(worker);
	struct nvme_delq_ctx dq;
	struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
					&worker, "nvme%d", dev->ctrl.instance);

	if (IS_ERR(kworker_task)) {
		dev_err(dev->dev,
			"Failed to create queue del task\n");
		for (i = dev->queue_count - 1; i > 0; i--)
			nvme_disable_queue(dev, i);
		return;
	}

	dq.waiter = NULL;
	atomic_set(&dq.refcount, 0);
	dq.worker = &worker;
	for (i = dev->queue_count - 1; i > 0; i--) {
		struct nvme_queue *nvmeq = dev->queues[i];

		if (nvme_suspend_queue(nvmeq))
			continue;
		nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
		nvmeq->cmdinfo.worker = dq.worker;
		init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
		queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
	}
	nvme_wait_dq(&dq, dev);
	kthread_stop(kworker_task);
}

/*
 * Remove the node from the device list and check
 * for whether or not we need to stop the nvme_thread.
 */
static void nvme_dev_list_remove(struct nvme_dev *dev)
{
	struct task_struct *tmp = NULL;

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
		tmp = nvme_thread;
		nvme_thread = NULL;
	}
	spin_unlock(&dev_list_lock);

	if (tmp)
		kthread_stop(tmp);
}

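/*
 * Quiesce all namespace request queues so that no new I/O reaches the
 * hardware while the controller is shut down or reset;
 * nvme_unfreeze_queues() below restarts them once the controller is back.
 */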
static void nvme_freeze_queues(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		blk_mq_freeze_queue_start(ns->queue);

		spin_lock_irq(ns->queue->queue_lock);
		queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
		spin_unlock_irq(ns->queue->queue_lock);

		blk_mq_cancel_requeue_work(ns->queue);
		blk_mq_stop_hw_queues(ns->queue);
	}
}

static void nvme_unfreeze_queues(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
		blk_mq_unfreeze_queue(ns->queue);
		blk_mq_start_stopped_hw_queues(ns->queue, true);
		blk_mq_kick_requeue_list(ns->queue);
	}
}

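/*
 * Tear down the controller: freeze I/O, delete or suspend the queues
 * depending on whether the controller is still responsive, shut the
 * controller down cleanly when possible, and unmap the PCI resources.
 */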
static void nvme_dev_shutdown(struct nvme_dev *dev)
{
	int i;
	u32 csts = -1;

	nvme_dev_list_remove(dev);

	if (dev->bar) {
		nvme_freeze_queues(dev);
		csts = readl(dev->bar + NVME_REG_CSTS);
	}
	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
		for (i = dev->queue_count - 1; i >= 0; i--) {
			struct nvme_queue *nvmeq = dev->queues[i];
			nvme_suspend_queue(nvmeq);
		}
	} else {
		nvme_disable_io_queues(dev);
		nvme_shutdown_ctrl(&dev->ctrl);
		nvme_disable_queue(dev, 0);
	}
	nvme_dev_unmap(dev);

	for (i = dev->queue_count - 1; i >= 0; i--)
		nvme_clear_queue(dev->queues[i]);
}

static void nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	list_for_each_entry_safe(ns, next, &dev->namespaces, list)
		nvme_ns_remove(ns);
}

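/*
 * Create the DMA pools used for PRP lists: page-sized blocks for large
 * transfers and 256-byte blocks for I/Os that need only a short list.
 */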
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->ctrl.instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->ctrl.instance);
	spin_unlock(&dev_list_lock);
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	put_device(dev->dev);
	put_device(dev->device);
	nvme_release_instance(dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static int nvme_dev_open(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev;
	int instance = iminor(inode);
	int ret = -ENODEV;

	spin_lock(&dev_list_lock);
	list_for_each_entry(dev, &dev_list, node) {
		if (dev->ctrl.instance == instance) {
			if (!dev->ctrl.admin_q) {
				ret = -EWOULDBLOCK;
				break;
			}
			if (!kref_get_unless_zero(&dev->ctrl.kref))
				break;
			f->private_data = dev;
			ret = 0;
			break;
		}
	}
	spin_unlock(&dev_list_lock);

	return ret;
}

static int nvme_dev_release(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = f->private_data;
	nvme_put_ctrl(&dev->ctrl);
	return 0;
}

static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct nvme_dev *dev = f->private_data;
	struct nvme_ns *ns;

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(&dev->ctrl, NULL, (void __user *)arg);
	case NVME_IOCTL_IO_CMD:
		if (list_empty(&dev->namespaces))
			return -ENOTTY;
		ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
		return nvme_user_cmd(&dev->ctrl, ns, (void __user *)arg);
	case NVME_IOCTL_RESET:
		dev_warn(dev->dev, "resetting controller\n");
		return nvme_reset(dev);
	case NVME_IOCTL_SUBSYS_RESET:
		return nvme_subsys_reset(dev);
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

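/*
 * Worker for dev->probe_work: bring the controller up by mapping the
 * device, setting up the admin queue, starting the shared polling kthread
 * if needed, creating the I/O queues and registering the namespaces.
 * On failure the partially initialized state is torn back down.
 */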
static void nvme_probe_work(struct work_struct *work)
{
	struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
	bool start_thread = false;
	int result;

	result = nvme_dev_map(dev);
	if (result)
		goto out;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;

	spin_lock(&dev_list_lock);
	if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
		start_thread = true;
		nvme_thread = NULL;
	}
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	if (start_thread) {
		nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
		wake_up_all(&nvme_kthread_wait);
	} else
		wait_event_killable(nvme_kthread_wait, nvme_thread);

	if (IS_ERR_OR_NULL(nvme_thread)) {
		result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
		goto disable;
	}

	nvme_init_queue(dev->queues[0], 0);
	result = nvme_alloc_admin_tags(dev);
	if (result)
		goto disable;

	result = nvme_setup_io_queues(dev);
	if (result)
		goto free_tags;

	dev->ctrl.event_limit = 1;

	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->dev, "IO queues not created\n");
		nvme_dev_remove(dev);
	} else {
		nvme_unfreeze_queues(dev);
		nvme_dev_add(dev);
	}

	return;

 free_tags:
	nvme_dev_remove_admin(dev);
	blk_put_queue(dev->ctrl.admin_q);
	dev->ctrl.admin_q = NULL;
	dev->queues[0]->tags = NULL;
 disable:
	nvme_disable_queue(dev, 0);
	nvme_dev_list_remove(dev);
 unmap:
	nvme_dev_unmap(dev);
 out:
	if (!work_busy(&dev->reset_work))
		nvme_dead_ctrl(dev);
}

static int nvme_remove_dead_ctrl(void *arg)
{
	struct nvme_dev *dev = (struct nvme_dev *)arg;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	if (pci_get_drvdata(pdev))
		pci_stop_and_remove_bus_device_locked(pdev);
	nvme_put_ctrl(&dev->ctrl);
	return 0;
}

static void nvme_dead_ctrl(struct nvme_dev *dev)
{
	dev_warn(dev->dev, "Device failed to resume\n");
	kref_get(&dev->ctrl.kref);
	if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
						dev->ctrl.instance))) {
		dev_err(dev->dev,
			"Failed to start controller remove task\n");
		nvme_put_ctrl(&dev->ctrl);
	}
}

static void nvme_reset_work(struct work_struct *ws)
{
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
	bool in_probe = work_busy(&dev->probe_work);

	nvme_dev_shutdown(dev);

	/* Synchronize with device probe so that work will see failure status
	 * and exit gracefully without trying to schedule another reset */
	flush_work(&dev->probe_work);

	/* Fail this device if reset occurred during probe to avoid
	 * infinite initialization loops. */
	if (in_probe) {
		nvme_dead_ctrl(dev);
		return;
	}
	/* Schedule device resume asynchronously so the reset work is available
	 * to cleanup errors that may occur during reinitialization */
	schedule_work(&dev->probe_work);
}

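/*
 * Queue the reset work for this controller.  The caller must hold
 * dev_list_lock (see nvme_reset() below).
 */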
static int __nvme_reset(struct nvme_dev *dev)
{
	if (work_pending(&dev->reset_work))
		return -EBUSY;
	list_del_init(&dev->node);
	queue_work(nvme_workq, &dev->reset_work);
	return 0;
}

static int nvme_reset(struct nvme_dev *dev)
{
	int ret;

	if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
		return -ENODEV;

	spin_lock(&dev_list_lock);
	ret = __nvme_reset(dev);
	spin_unlock(&dev_list_lock);

	if (!ret) {
		flush_work(&dev->reset_work);
		flush_work(&dev->probe_work);
		return 0;
	}

	return ret;
}

static ssize_t nvme_sysfs_reset(struct device *dev,
				struct device_attribute *attr, const char *buf,
				size_t count)
{
	struct nvme_dev *ndev = dev_get_drvdata(dev);
	int ret;

	ret = nvme_reset(ndev);
	if (ret < 0)
		return ret;

	return count;
}
static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);

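/*
 * Register access callbacks used by the core NVMe code through
 * nvme_pci_ctrl_ops below; they simply forward to the memory-mapped
 * controller registers.
 */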
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
{
	*val = readl(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
{
	writel(val, to_nvme_dev(ctrl)->bar + off);
	return 0;
}

static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
	.reg_read32		= nvme_pci_reg_read32,
	.reg_write32		= nvme_pci_reg_write32,
	.free_ctrl		= nvme_pci_free_ctrl,
};

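/*
 * PCI probe entry point: allocate the per-controller state, register the
 * character device node, and kick off nvme_probe_work() to do the actual
 * controller initialization asynchronously.
 */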
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int node, result = -ENOMEM;
	struct nvme_dev *dev;

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, 0);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	if (!dev)
		return -ENOMEM;
	dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
							GFP_KERNEL, node);
	if (!dev->entry)
		goto free;
	dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
							GFP_KERNEL, node);
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	INIT_WORK(&dev->reset_work, nvme_reset_work);
	dev->dev = get_device(&pdev->dev);
	pci_set_drvdata(pdev, dev);

	dev->ctrl.ops = &nvme_pci_ctrl_ops;
	dev->ctrl.dev = dev->dev;
	dev->ctrl.quirks = id->driver_data;

	result = nvme_set_instance(dev);
	if (result)
		goto put_pci;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto release;

	kref_init(&dev->ctrl.kref);
	dev->device = device_create(nvme_class, &pdev->dev,
				MKDEV(nvme_char_major, dev->ctrl.instance),
				dev, "nvme%d", dev->ctrl.instance);
	if (IS_ERR(dev->device)) {
		result = PTR_ERR(dev->device);
		goto release_pools;
	}
	get_device(dev->device);
	dev_set_drvdata(dev->device, dev);

	result = device_create_file(dev->device, &dev_attr_reset_controller);
	if (result)
		goto put_dev;

	INIT_LIST_HEAD(&dev->node);
	INIT_WORK(&dev->scan_work, nvme_dev_scan);
	INIT_WORK(&dev->probe_work, nvme_probe_work);
	schedule_work(&dev->probe_work);
	return 0;

 put_dev:
	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance));
	put_device(dev->device);
 release_pools:
	nvme_release_prp_pools(dev);
 release:
	nvme_release_instance(dev);
 put_pci:
	put_device(dev->dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	if (prepare)
		nvme_dev_shutdown(dev);
	else
		schedule_work(&dev->probe_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_shutdown(dev);
}

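/*
 * PCI remove entry point: flush any in-flight probe/reset/scan work,
 * remove the namespaces, shut the controller down and release all
 * resources acquired in nvme_probe().
 */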
static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	pci_set_drvdata(pdev, NULL);
	flush_work(&dev->probe_work);
	flush_work(&dev->reset_work);
	flush_work(&dev->scan_work);
	device_remove_file(dev->device, &dev_attr_reset_controller);
	nvme_dev_remove(dev);
	nvme_dev_shutdown(dev);
	nvme_dev_remove_admin(dev);
	device_destroy(nvme_class, MKDEV(nvme_char_major, dev->ctrl.instance));
	nvme_free_queues(dev, 0);
	nvme_release_cmb(dev);
	nvme_release_prp_pools(dev);
	nvme_put_ctrl(&dev->ctrl);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_shutdown(ndev);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	schedule_work(&ndev->probe_work);
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static const struct pci_error_handlers nvme_err_handler = {
	.error_detected	= nvme_error_detected,
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
	.reset_notify	= nvme_reset_notify,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static const struct pci_device_id nvme_id_table[] = {
	{ PCI_VDEVICE(INTEL, 0x0953),
		.driver_data = NVME_QUIRK_STRIPE_SIZE, },
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	init_waitqueue_head(&nvme_kthread_wait);

	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		return -ENOMEM;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		goto kill_workq;
	else if (result > 0)
		nvme_major = result;

	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
							&nvme_dev_fops);
	if (result < 0)
		goto unregister_blkdev;
	else if (result > 0)
		nvme_char_major = result;

	nvme_class = class_create(THIS_MODULE, "nvme");
	if (IS_ERR(nvme_class)) {
		result = PTR_ERR(nvme_class);
		goto unregister_chrdev;
	}

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto destroy_class;
	return 0;

 destroy_class:
	class_destroy(nvme_class);
 unregister_chrdev:
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
	destroy_workqueue(nvme_workq);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	destroy_workqueue(nvme_workq);
	class_destroy(nvme_class);
	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
	BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
	_nvme_check_size();
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("1.0");
module_init(nvme_init);
module_exit(nvme_exit);