/*
 * NVM Express device driver
 * Copyright (c) 2011, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include <linux/nvme.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kdev_t.h>
#include <linux/kthread.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/poison.h>
#include <linux/ptrace.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <scsi/sg.h>
#include <asm-generic/io-64-nonatomic-lo-hi.h>

#define NVME_Q_DEPTH 1024
#define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
#define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
#define ADMIN_TIMEOUT	(60 * HZ)

static int nvme_major;
module_param(nvme_major, int, 0);

static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0);

static DEFINE_SPINLOCK(dev_list_lock);
static LIST_HEAD(dev_list);
static struct task_struct *nvme_thread;
static struct workqueue_struct *nvme_workq;

static void nvme_reset_failed_dev(struct work_struct *ws);

struct async_cmd_info {
	struct kthread_work work;
	struct kthread_worker *worker;
	u32 result;
	int status;
	void *ctx;
};

/*
 * An NVM Express queue.  Each device has at least two (one for admin
 * commands and one for I/O commands).
 */
struct nvme_queue {
	struct rcu_head r_head;
	struct device *q_dmadev;
	struct nvme_dev *dev;
	char irqname[24];	/* nvme4294967295-65535\0 */
	spinlock_t q_lock;
	struct nvme_command *sq_cmds;
	volatile struct nvme_completion *cqes;
	dma_addr_t sq_dma_addr;
	dma_addr_t cq_dma_addr;
	wait_queue_head_t sq_full;
	wait_queue_t sq_cong_wait;
	struct bio_list sq_cong;
	u32 __iomem *q_db;
	u16 q_depth;
	u16 cq_vector;
	u16 sq_head;
	u16 sq_tail;
	u16 cq_head;
	u16 qid;
	u8 cq_phase;
	u8 cqe_seen;
	u8 q_suspended;
	struct async_cmd_info cmdinfo;
	unsigned long cmdid_data[];
};

/*
 * Check we didn't inadvertently grow the command struct
 */
static inline void _nvme_check_size(void)
{
	BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
}

typedef void (*nvme_completion_fn)(struct nvme_dev *, void *,
						struct nvme_completion *);

struct nvme_cmd_info {
	nvme_completion_fn fn;
	void *ctx;
	unsigned long timeout;
	int aborted;
};

static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq)
{
	return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)];
}

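/*
 * Per-queue bookkeeping appended to struct nvme_queue: a command-id bitmap
 * (one bit per queue entry) followed by an array of struct nvme_cmd_info.
 */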
static unsigned nvme_queue_extra(int depth)
{
	return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info));
}

/**
 * alloc_cmdid() - Allocate a Command ID
 * @nvmeq: The queue that will be used for this command
 * @ctx: A pointer that will be passed to the handler
 * @handler: The function to call on completion
 *
 * Allocate a Command ID for a queue.  The data passed in will
 * be passed to the completion handler.  This is implemented by using
 * the bottom two bits of the ctx pointer to store the handler ID.
 * Passing in a pointer that's not 4-byte aligned will cause a BUG.
 * We can change this if it becomes a problem.
 *
 * May be called with local interrupts disabled and the q_lock held,
 * or with interrupts enabled and no locks held.
 */
static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	int cmdid;

	do {
		cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth);
		if (cmdid >= depth)
			return -EBUSY;
	} while (test_and_set_bit(cmdid, nvmeq->cmdid_data));

	info[cmdid].fn = handler;
	info[cmdid].ctx = ctx;
	info[cmdid].timeout = jiffies + timeout;
	info[cmdid].aborted = 0;
	return cmdid;
}

static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx,
				nvme_completion_fn handler, unsigned timeout)
{
	int cmdid;
	wait_event_killable(nvmeq->sq_full,
		(cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0);
	return (cmdid < 0) ? -EINTR : cmdid;
}

/* Special values must be less than 0x1000 */
#define CMD_CTX_BASE		((void *)POISON_POINTER_DELTA)
#define CMD_CTX_CANCELLED	(0x30C + CMD_CTX_BASE)
#define CMD_CTX_COMPLETED	(0x310 + CMD_CTX_BASE)
#define CMD_CTX_INVALID		(0x314 + CMD_CTX_BASE)
#define CMD_CTX_FLUSH		(0x318 + CMD_CTX_BASE)
#define CMD_CTX_ABORT		(0x31C + CMD_CTX_BASE)

static void special_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	if (ctx == CMD_CTX_CANCELLED)
		return;
	if (ctx == CMD_CTX_FLUSH)
		return;
	if (ctx == CMD_CTX_ABORT) {
		++dev->abort_limit;
		return;
	}
	if (ctx == CMD_CTX_COMPLETED) {
		dev_warn(&dev->pci_dev->dev,
				"completed id %d twice on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}
	if (ctx == CMD_CTX_INVALID) {
		dev_warn(&dev->pci_dev->dev,
				"invalid id %d completed on queue %d\n",
				cqe->command_id, le16_to_cpup(&cqe->sq_id));
		return;
	}

	dev_warn(&dev->pci_dev->dev, "Unknown special completion %p\n", ctx);
}

static void async_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct async_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);

	if (cmdid >= nvmeq->q_depth) {
		*fn = special_completion;
		return CMD_CTX_INVALID;
	}
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_COMPLETED;
	clear_bit(cmdid, nvmeq->cmdid_data);
	wake_up(&nvmeq->sq_full);
	return ctx;
}

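/*
 * Mark a command as cancelled without releasing its command id; a late
 * completion is then ignored by special_completion().
 */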
static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid,
						nvme_completion_fn *fn)
{
	void *ctx;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	if (fn)
		*fn = info[cmdid].fn;
	ctx = info[cmdid].ctx;
	info[cmdid].fn = special_completion;
	info[cmdid].ctx = CMD_CTX_CANCELLED;
	return ctx;
}

static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid)
{
	return rcu_dereference_raw(dev->queues[qid]);
}

struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(dev->queues[get_cpu() + 1]);
}

void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU)
{
	put_cpu();
	rcu_read_unlock();
}

/**
 * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
 * @nvmeq: The queue to use
 * @cmd: The command to send
 *
 * Safe to use from interrupt context
 */
static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
{
	unsigned long flags;
	u16 tail;
	spin_lock_irqsave(&nvmeq->q_lock, flags);
	tail = nvmeq->sq_tail;
	memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
	if (++tail == nvmeq->q_depth)
		tail = 0;
	writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
	spin_unlock_irqrestore(&nvmeq->q_lock, flags);

	return 0;
}

static __le64 **iod_list(struct nvme_iod *iod)
{
	return ((void *)iod) + iod->offset;
}

/*
 * Will slightly overestimate the number of pages needed.  This is OK
 * as it only leads to a small amount of wasted memory for the lifetime of
 * the I/O.
 */
static int nvme_npages(unsigned size)
{
	unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE);
	return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}

static struct nvme_iod *
nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp)
{
	struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
				sizeof(__le64 *) * nvme_npages(nbytes) +
				sizeof(struct scatterlist) * nseg, gfp);

	if (iod) {
		iod->offset = offsetof(struct nvme_iod, sg[nseg]);
		iod->npages = -1;
		iod->length = nbytes;
		iod->nents = 0;
		iod->start_time = jiffies;
	}

	return iod;
}

void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
{
	const int last_prp = PAGE_SIZE / 8 - 1;
	int i;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma = iod->first_dma;

	if (iod->npages == 0)
		dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
	for (i = 0; i < iod->npages; i++) {
		__le64 *prp_list = list[i];
		dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
		dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
		prp_dma = next_prp_dma;
	}
	kfree(iod);
}

static void nvme_start_io_acct(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	int cpu = part_stat_lock();
	part_round_stats(cpu, &disk->part0);
	part_stat_inc(cpu, &disk->part0, ios[rw]);
	part_stat_add(cpu, &disk->part0, sectors[rw], bio_sectors(bio));
	part_inc_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void nvme_end_io_acct(struct bio *bio, unsigned long start_time)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	const int rw = bio_data_dir(bio);
	unsigned long duration = jiffies - start_time;
	int cpu = part_stat_lock();
	part_stat_add(cpu, &disk->part0, ticks[rw], duration);
	part_round_stats(cpu, &disk->part0);
	part_dec_in_flight(&disk->part0, rw);
	part_stat_unlock();
}

static void bio_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct nvme_iod *iod = ctx;
	struct bio *bio = iod->private;
	u16 status = le16_to_cpup(&cqe->status) >> 1;

	if (iod->nents) {
		dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
		nvme_end_io_acct(bio, iod->start_time);
	}
	nvme_free_iod(dev, iod);
	if (status)
		bio_endio(bio, -EIO);
	else
		bio_endio(bio, 0);
}

/* length is in bytes.  gfp flags indicates whether we may sleep. */
int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
			struct nvme_iod *iod, int total_len, gfp_t gfp)
{
	struct dma_pool *pool;
	int length = total_len;
	struct scatterlist *sg = iod->sg;
	int dma_len = sg_dma_len(sg);
	u64 dma_addr = sg_dma_address(sg);
	int offset = offset_in_page(dma_addr);
	__le64 *prp_list;
	__le64 **list = iod_list(iod);
	dma_addr_t prp_dma;
	int nprps, i;

	cmd->prp1 = cpu_to_le64(dma_addr);
	length -= (PAGE_SIZE - offset);
	if (length <= 0)
		return total_len;

	dma_len -= (PAGE_SIZE - offset);
	if (dma_len) {
		dma_addr += (PAGE_SIZE - offset);
	} else {
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	if (length <= PAGE_SIZE) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		return total_len;
	}

	nprps = DIV_ROUND_UP(length, PAGE_SIZE);
	if (nprps <= (256 / 8)) {
		pool = dev->prp_small_pool;
		iod->npages = 0;
	} else {
		pool = dev->prp_page_pool;
		iod->npages = 1;
	}

	prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
	if (!prp_list) {
		cmd->prp2 = cpu_to_le64(dma_addr);
		iod->npages = -1;
		return (total_len - length) + PAGE_SIZE;
	}
	list[0] = prp_list;
	iod->first_dma = prp_dma;
	cmd->prp2 = cpu_to_le64(prp_dma);
	i = 0;
	for (;;) {
		if (i == PAGE_SIZE / 8) {
			__le64 *old_prp_list = prp_list;
			prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
			if (!prp_list)
				return total_len - length;
			list[iod->npages++] = prp_list;
			prp_list[0] = old_prp_list[i - 1];
			old_prp_list[i - 1] = cpu_to_le64(prp_dma);
			i = 1;
		}
		prp_list[i++] = cpu_to_le64(dma_addr);
		dma_len -= PAGE_SIZE;
		dma_addr += PAGE_SIZE;
		length -= PAGE_SIZE;
		if (length <= 0)
			break;
		if (dma_len > 0)
			continue;
		BUG_ON(dma_len < 0);
		sg = sg_next(sg);
		dma_addr = sg_dma_address(sg);
		dma_len = sg_dma_len(sg);
	}

	return total_len;
}

static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
				 int len)
{
	struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
	if (!split)
		return -ENOMEM;

	bio_chain(split, bio);

	if (bio_list_empty(&nvmeq->sq_cong))
		add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
	bio_list_add(&nvmeq->sq_cong, split);
	bio_list_add(&nvmeq->sq_cong, bio);

	return 0;
}

/* NVMe scatterlists require no holes in the virtual address */
#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2)	((vec2)->bv_offset || \
			(((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))

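/*
 * Build the iod scatterlist from the bio, merging physically contiguous
 * segments.  The bio is split and requeued on sq_cong if a segment is not
 * virtually mergeable or the I/O crosses the device's stripe boundary.
 * Returns the number of bytes mapped for DMA, or a negative errno.
 */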
static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
		struct bio *bio, enum dma_data_direction dma_dir, int psegs)
{
	struct bio_vec bvec, bvprv;
	struct bvec_iter iter;
	struct scatterlist *sg = NULL;
	int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
	int first = 1;

	if (nvmeq->dev->stripe_size)
		split_len = nvmeq->dev->stripe_size -
			((bio->bi_iter.bi_sector << 9) &
			 (nvmeq->dev->stripe_size - 1));

	sg_init_table(iod->sg, psegs);
	bio_for_each_segment(bvec, bio, iter) {
		if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
			sg->length += bvec.bv_len;
		} else {
			if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
				return nvme_split_and_submit(bio, nvmeq,
							     length);

			sg = sg ? sg + 1 : iod->sg;
			sg_set_page(sg, bvec.bv_page,
				    bvec.bv_len, bvec.bv_offset);
			nsegs++;
		}

		if (split_len - length < bvec.bv_len)
			return nvme_split_and_submit(bio, nvmeq, split_len);
		length += bvec.bv_len;
		bvprv = bvec;
		first = 0;
	}
	iod->nents = nsegs;
	sg_mark_end(sg);
	if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
		return -ENOMEM;

	BUG_ON(length != bio->bi_iter.bi_size);
	return length;
}

/*
 * We reuse the small pool to allocate the 16-byte range here as it is not
 * worth having a special pool for these or additional cases to handle freeing
 * the iod.
 */
static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
		struct bio *bio, struct nvme_iod *iod, int cmdid)
{
	struct nvme_dsm_range *range;
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC,
							&iod->first_dma);
	if (!range)
		return -ENOMEM;

	iod_list(iod)[0] = (__le64 *)range;
	iod->npages = 0;

	range->cattr = cpu_to_le32(0);
567 568
	range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift);
	range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->dsm.opcode = nvme_cmd_dsm;
	cmnd->dsm.command_id = cmdid;
	cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
	cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
	cmnd->dsm.nr = 0;
	cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								int cmdid)
{
	struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	cmnd->common.opcode = nvme_cmd_flush;
	cmnd->common.command_id = cmdid;
	cmnd->common.nsid = cpu_to_le32(ns->ns_id);

	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;
}

int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
{
	int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH,
					special_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		return cmdid;

	return nvme_submit_flush(nvmeq, ns, cmdid);
}

/*
 * Called with local interrupts disabled and the q_lock held.  May not sleep.
 */
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
								struct bio *bio)
{
	struct nvme_command *cmnd;
	struct nvme_iod *iod;
	enum dma_data_direction dma_dir;
	int cmdid, length, result;
	u16 control;
	u32 dsmgmt;
	int psegs = bio_phys_segments(ns->queue, bio);

	if ((bio->bi_rw & REQ_FLUSH) && psegs) {
		result = nvme_submit_flush_data(nvmeq, ns);
		if (result)
			return result;
	}

	result = -ENOMEM;
	iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC);
	if (!iod)
		goto nomem;
	iod->private = bio;

	result = -EBUSY;
	cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT);
	if (unlikely(cmdid < 0))
		goto free_iod;

	if (bio->bi_rw & REQ_DISCARD) {
		result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid);
		if (result)
			goto free_cmdid;
		return result;
	}
	if ((bio->bi_rw & REQ_FLUSH) && !psegs)
		return nvme_submit_flush(nvmeq, ns, cmdid);

	control = 0;
	if (bio->bi_rw & REQ_FUA)
		control |= NVME_RW_FUA;
	if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD))
		control |= NVME_RW_LR;

	dsmgmt = 0;
	if (bio->bi_rw & REQ_RAHEAD)
		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;

	cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];

	memset(cmnd, 0, sizeof(*cmnd));
	if (bio_data_dir(bio)) {
		cmnd->rw.opcode = nvme_cmd_write;
		dma_dir = DMA_TO_DEVICE;
	} else {
		cmnd->rw.opcode = nvme_cmd_read;
		dma_dir = DMA_FROM_DEVICE;
	}

	result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
	if (result <= 0)
		goto free_cmdid;
	length = result;

	cmnd->rw.command_id = cmdid;
	cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
	length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
								GFP_ATOMIC);
	cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
	cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
	cmnd->rw.control = cpu_to_le16(control);
	cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);

	nvme_start_io_acct(bio);
	if (++nvmeq->sq_tail == nvmeq->q_depth)
		nvmeq->sq_tail = 0;
	writel(nvmeq->sq_tail, nvmeq->q_db);

	return 0;

 free_cmdid:
	free_cmdid(nvmeq, cmdid, NULL);
 free_iod:
	nvme_free_iod(nvmeq->dev, iod);
 nomem:
	return result;
}

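/*
 * Reap completions: walk the CQ from cq_head while the phase bit matches,
 * invoke each command's completion handler, then ring the CQ head doorbell.
 * Called with q_lock held; returns nonzero if any entries were consumed.
 */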
static int nvme_process_cq(struct nvme_queue *nvmeq)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	for (;;) {
		void *ctx;
		nvme_completion_fn fn;
		struct nvme_completion cqe = nvmeq->cqes[head];
		if ((le16_to_cpu(cqe.status) & 1) != phase)
			break;
		nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		ctx = free_cmdid(nvmeq, cqe.command_id, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}

	/* If the controller ignores the cq head doorbell and continuously
	 * writes to the queue, it is theoretically possible to wrap around
	 * the queue twice and mistakenly return IRQ_NONE.  Linux only
	 * requires that 0.1% of your interrupts are handled, so this isn't
	 * a big problem.
	 */
	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return 0;

	writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
	return 1;
}

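/*
 * Block-layer entry point: submit the bio on this CPU's I/O queue, or park
 * it on sq_cong for the kthread to resubmit if the queue is busy or
 * suspended.
 */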
static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
	struct nvme_ns *ns = q->queuedata;
	struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
	int result = -EBUSY;

	if (!nvmeq) {
		put_nvmeq(NULL);
		bio_endio(bio, -EIO);
		return;
	}

	spin_lock_irq(&nvmeq->q_lock);
	if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong))
		result = nvme_submit_bio_queue(nvmeq, ns, bio);
	if (unlikely(result)) {
		if (bio_list_empty(&nvmeq->sq_cong))
			add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
		bio_list_add(&nvmeq->sq_cong, bio);
	}

	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	put_nvmeq(nvmeq);
}

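/*
 * Interrupt handlers: nvme_irq() drains the completion queue under q_lock;
 * nvme_irq_check() is the hard-irq half used with threaded interrupts and
 * only peeks at the phase bit.
 */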
static irqreturn_t nvme_irq(int irq, void *data)
{
	irqreturn_t result;
	struct nvme_queue *nvmeq = data;
	spin_lock(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
	nvmeq->cqe_seen = 0;
	spin_unlock(&nvmeq->q_lock);
	return result;
}

static irqreturn_t nvme_irq_check(int irq, void *data)
{
	struct nvme_queue *nvmeq = data;
	struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
	if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
		return IRQ_NONE;
	return IRQ_WAKE_THREAD;
}

static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid)
{
	spin_lock_irq(&nvmeq->q_lock);
	cancel_cmdid(nvmeq, cmdid, NULL);
	spin_unlock_irq(&nvmeq->q_lock);
}

struct sync_cmd_info {
	struct task_struct *task;
	u32 result;
	int status;
};

static void sync_completion(struct nvme_dev *dev, void *ctx,
						struct nvme_completion *cqe)
{
	struct sync_cmd_info *cmdinfo = ctx;
	cmdinfo->result = le32_to_cpup(&cqe->result);
	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
	wake_up_process(cmdinfo->task);
}

/*
 * Returns 0 on success.  If the result is negative, it's a Linux error code;
 * if the result is positive, it's an NVM Express status code
 */
int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
						u32 *result, unsigned timeout)
{
	int cmdid;
	struct sync_cmd_info cmdinfo;

	cmdinfo.task = current;
	cmdinfo.status = -EINTR;

	cmdid = alloc_cmdid_killable(nvmeq, &cmdinfo, sync_completion,
								timeout);
	if (cmdid < 0)
		return cmdid;
	cmd->common.command_id = cmdid;

	set_current_state(TASK_KILLABLE);
	nvme_submit_cmd(nvmeq, cmd);
	schedule_timeout(timeout);

	if (cmdinfo.status == -EINTR) {
		nvme_abort_command(nvmeq, cmdid);
		return -EINTR;
	}

	if (result)
		*result = cmdinfo.result;

	return cmdinfo.status;
}

static int nvme_submit_async_cmd(struct nvme_queue *nvmeq,
			struct nvme_command *cmd,
			struct async_cmd_info *cmdinfo, unsigned timeout)
{
	int cmdid;

	cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout);
	if (cmdid < 0)
		return cmdid;
	cmdinfo->status = -EINTR;
	cmd->common.command_id = cmdid;
	nvme_submit_cmd(nvmeq, cmd);
	return 0;
}

int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
								u32 *result)
{
	return nvme_submit_sync_cmd(raw_nvmeq(dev, 0), cmd, result,
								ADMIN_TIMEOUT);
}

static int nvme_submit_admin_cmd_async(struct nvme_dev *dev,
		struct nvme_command *cmd, struct async_cmd_info *cmdinfo)
{
	return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo,
								ADMIN_TIMEOUT);
}

static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	int status;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(id);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
						struct nvme_queue *nvmeq)
{
	int status;
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	status = nvme_submit_admin_cmd(dev, &c, NULL);
	if (status)
		return -EIO;
	return 0;
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}

int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
							dma_addr_t dma_addr)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.identify.opcode = nvme_admin_identify;
	c.identify.nsid = cpu_to_le32(nsid);
	c.identify.prp1 = cpu_to_le64(dma_addr);
	c.identify.cns = cpu_to_le32(cns);

	return nvme_submit_admin_cmd(dev, &c, NULL);
}

int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_get_features;
	c.features.nsid = cpu_to_le32(nsid);
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);

	return nvme_submit_admin_cmd(dev, &c, result);
}

int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
					dma_addr_t dma_addr, u32 *result)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.prp1 = cpu_to_le64(dma_addr);
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	return nvme_submit_admin_cmd(dev, &c, result);
}

/**
 * nvme_abort_cmd - Attempt aborting a command
 * @cmdid: Command id of a timed out IO
 * @queue: The queue with timed out IO
 *
 * Schedule controller reset if the command was already aborted once before and
 * still hasn't been returned to the driver, or if this is the admin queue.
 */
static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq)
{
	int a_cmdid;
	struct nvme_command cmd;
	struct nvme_dev *dev = nvmeq->dev;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
997
	struct nvme_queue *adminq;
K
	if (!nvmeq->qid || info[cmdid].aborted) {
		if (work_busy(&dev->reset_work))
			return;
		list_del_init(&dev->node);
		dev_warn(&dev->pci_dev->dev,
			"I/O %d QID %d timeout, reset controller\n", cmdid,
								nvmeq->qid);
		PREPARE_WORK(&dev->reset_work, nvme_reset_failed_dev);
		queue_work(nvme_workq, &dev->reset_work);
		return;
	}

	if (!dev->abort_limit)
		return;

	adminq = rcu_dereference(dev->queues[0]);
	a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion,
								ADMIN_TIMEOUT);
	if (a_cmdid < 0)
		return;

	memset(&cmd, 0, sizeof(cmd));
	cmd.abort.opcode = nvme_admin_abort_cmd;
	cmd.abort.cid = cmdid;
	cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
	cmd.abort.command_id = a_cmdid;

	--dev->abort_limit;
	info[cmdid].aborted = 1;
	info[cmdid].timeout = jiffies + ADMIN_TIMEOUT;

	dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid,
							nvmeq->qid);
	nvme_submit_cmd(adminq, &cmd);
}

/**
 * nvme_cancel_ios - Cancel outstanding I/Os
 * @queue: The queue to cancel I/Os on
 * @timeout: True to only cancel I/Os which have timed out
 */
static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout)
{
	int depth = nvmeq->q_depth - 1;
	struct nvme_cmd_info *info = nvme_cmd_info(nvmeq);
	unsigned long now = jiffies;
	int cmdid;

	for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) {
		void *ctx;
		nvme_completion_fn fn;
		static struct nvme_completion cqe = {
			.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1),
		};

		if (timeout && !time_after(now, info[cmdid].timeout))
			continue;
		if (info[cmdid].ctx == CMD_CTX_CANCELLED)
			continue;
		if (timeout && nvmeq->dev->initialized) {
			nvme_abort_cmd(cmdid, nvmeq);
			continue;
		}
		dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid,
								nvmeq->qid);
		ctx = cancel_cmdid(nvmeq, cmdid, &fn);
		fn(nvmeq->dev, ctx, &cqe);
	}
}

static void nvme_free_queue(struct rcu_head *r)
{
	struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head);

	spin_lock_irq(&nvmeq->q_lock);
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		bio_endio(bio, -EIO);
	}
	spin_unlock_irq(&nvmeq->q_lock);

	dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
				(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
	dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
					nvmeq->sq_cmds, nvmeq->sq_dma_addr);
	kfree(nvmeq);
}

static void nvme_free_queues(struct nvme_dev *dev, int lowest)
{
	int i;

	for (i = num_possible_cpus(); i > dev->queue_count - 1; i--)
		rcu_assign_pointer(dev->queues[i], NULL);
	for (i = dev->queue_count - 1; i >= lowest; i--) {
		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
		rcu_assign_pointer(dev->queues[i], NULL);
		call_rcu(&nvmeq->r_head, nvme_free_queue);
		dev->queue_count--;
	}
}

/**
 * nvme_suspend_queue - put queue into suspended state
 * @nvmeq - queue to suspend
 *
 * Returns 1 if already suspended, 0 otherwise.
 */
static int nvme_suspend_queue(struct nvme_queue *nvmeq)
{
	int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;

	spin_lock_irq(&nvmeq->q_lock);
	if (nvmeq->q_suspended) {
		spin_unlock_irq(&nvmeq->q_lock);
		return 1;
	}
	nvmeq->q_suspended = 1;
	spin_unlock_irq(&nvmeq->q_lock);

	irq_set_affinity_hint(vector, NULL);
	free_irq(vector, nvmeq);

	return 0;
}

static void nvme_clear_queue(struct nvme_queue *nvmeq)
{
	spin_lock_irq(&nvmeq->q_lock);
	nvme_process_cq(nvmeq);
	nvme_cancel_ios(nvmeq, false);
	spin_unlock_irq(&nvmeq->q_lock);
}

static void nvme_disable_queue(struct nvme_dev *dev, int qid)
{
	struct nvme_queue *nvmeq = raw_nvmeq(dev, qid);

	if (!nvmeq)
		return;
	if (nvme_suspend_queue(nvmeq))
		return;

	/* Don't tell the adapter to delete the admin queue.
	 * Don't tell a removed adapter to delete IO queues. */
	if (qid && readl(&dev->bar->csts) != -1) {
		adapter_delete_sq(dev, qid);
		adapter_delete_cq(dev, qid);
	}
	nvme_clear_queue(nvmeq);
}

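/*
 * Allocate a queue structure plus DMA-coherent submission and completion
 * rings.  The queue starts suspended and is published in dev->queues[]
 * under RCU.
 */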
static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
							int depth, int vector)
{
	struct device *dmadev = &dev->pci_dev->dev;
	unsigned extra = nvme_queue_extra(depth);
	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL);
	if (!nvmeq)
		return NULL;

	nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth),
					&nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		goto free_nvmeq;
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth));

	nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
					&nvmeq->sq_dma_addr, GFP_KERNEL);
	if (!nvmeq->sq_cmds)
		goto free_cqdma;

	nvmeq->q_dmadev = dmadev;
	nvmeq->dev = dev;
	snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
			dev->instance, qid);
	spin_lock_init(&nvmeq->q_lock);
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	init_waitqueue_head(&nvmeq->sq_full);
	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
	bio_list_init(&nvmeq->sq_cong);
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	nvmeq->q_depth = depth;
	nvmeq->cq_vector = vector;
	nvmeq->qid = qid;
	nvmeq->q_suspended = 1;
	dev->queue_count++;
	rcu_assign_pointer(dev->queues[qid], nvmeq);

	return nvmeq;

 free_cqdma:
	dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
							nvmeq->cq_dma_addr);
 free_nvmeq:
	kfree(nvmeq);
	return NULL;
}

static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
							const char *name)
{
	if (use_threaded_interrupts)
		return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
					nvme_irq_check, nvme_irq, IRQF_SHARED,
					name, nvmeq);
	return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
				IRQF_SHARED, name, nvmeq);
}

static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	unsigned extra = nvme_queue_extra(nvmeq->q_depth);

	nvmeq->sq_tail = 0;
	nvmeq->cq_head = 0;
	nvmeq->cq_phase = 1;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
	memset(nvmeq->cmdid_data, 0, extra);
	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
	nvme_cancel_ios(nvmeq, false);
	nvmeq->q_suspended = 0;
}

static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result < 0)
		goto release_sq;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_init_queue(nvmeq, qid);
	spin_unlock_irq(&nvmeq->q_lock);

	return result;

 release_sq:
	adapter_delete_sq(dev, qid);
 release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}

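/*
 * Poll CSTS.RDY until it matches the requested enable state or the CAP.TO
 * timeout expires.
 */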
static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
{
	unsigned long timeout;
	u32 bit = enabled ? NVME_CSTS_RDY : 0;

	timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;

	while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device not ready; aborting initialisation\n");
			return -ENODEV;
		}
	}

	return 0;
}

/*
 * If the device has been passed off to us in an enabled state, just clear
 * the enabled bit.  The spec says we should set the 'shutdown notification
 * bits', but doing so may cause the device to complete commands to the
 * admin queue ... and we don't know what memory that might be pointing at!
 */
static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
{
	u32 cc = readl(&dev->bar->cc);

	if (cc & NVME_CC_ENABLE)
		writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc);
	return nvme_wait_ready(dev, cap, false);
}

static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
{
	return nvme_wait_ready(dev, cap, true);
}

static int nvme_shutdown_ctrl(struct nvme_dev *dev)
{
	unsigned long timeout;
	u32 cc;

	cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL;
	writel(cc, &dev->bar->cc);

	timeout = 2 * HZ + jiffies;
	while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
							NVME_CSTS_SHST_CMPLT) {
		msleep(100);
		if (fatal_signal_pending(current))
			return -EINTR;
		if (time_after(jiffies, timeout)) {
			dev_err(&dev->pci_dev->dev,
				"Device shutdown incomplete; abort shutdown\n");
			return -ENODEV;
		}
	}

	return 0;
}

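/*
 * Bring up the admin queue: disable the controller, program AQA/ASQ/ACQ
 * (allocating the queue on first use), re-enable the controller and request
 * the admin interrupt.
 */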
static int nvme_configure_admin_queue(struct nvme_dev *dev)
{
	int result;
	u32 aqa;
	u64 cap = readq(&dev->bar->cap);
	struct nvme_queue *nvmeq;

	result = nvme_disable_ctrl(dev, cap);
	if (result < 0)
		return result;

	nvmeq = raw_nvmeq(dev, 0);
	if (!nvmeq) {
		nvmeq = nvme_alloc_queue(dev, 0, 64, 0);
		if (!nvmeq)
			return -ENOMEM;
	}

	aqa = nvmeq->q_depth - 1;
	aqa |= aqa << 16;

	dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM;
	dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT;
	dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
	dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;

	writel(aqa, &dev->bar->aqa);
	writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
	writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
	writel(dev->ctrl_config, &dev->bar->cc);

	result = nvme_enable_ctrl(dev, cap);
	if (result)
		return result;

	result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
	if (result)
		return result;

	spin_lock_irq(&nvmeq->q_lock);
	nvme_init_queue(nvmeq, 0);
	spin_unlock_irq(&nvmeq->q_lock);
	return result;
}

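/*
 * Pin a user buffer with get_user_pages_fast(), build an iod scatterlist
 * and DMA-map it.  The address must be 4-byte aligned; returns an ERR_PTR
 * on failure.
 */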
struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
				unsigned long addr, unsigned length)
{
	int i, err, count, nents, offset;
	struct scatterlist *sg;
	struct page **pages;
	struct nvme_iod *iod;

	if (addr & 3)
		return ERR_PTR(-EINVAL);
	if (!length || length > INT_MAX - PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	offset = offset_in_page(addr);
	count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	err = get_user_pages_fast(addr, count, 1, pages);
	if (err < count) {
		count = err;
		err = -EFAULT;
		goto put_pages;
	}

	iod = nvme_alloc_iod(count, length, GFP_KERNEL);
	sg = iod->sg;
	sg_init_table(sg, count);
	for (i = 0; i < count; i++) {
		sg_set_page(&sg[i], pages[i],
			    min_t(unsigned, length, PAGE_SIZE - offset),
			    offset);
		length -= (PAGE_SIZE - offset);
		offset = 0;
	}
	sg_mark_end(&sg[i - 1]);
	iod->nents = count;

	err = -ENOMEM;
	nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
	if (!nents)
		goto free_iod;

	kfree(pages);
	return iod;

 free_iod:
	kfree(iod);
 put_pages:
	for (i = 0; i < count; i++)
		put_page(pages[i]);
	kfree(pages);
	return ERR_PTR(err);
}

void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
			struct nvme_iod *iod)
{
	int i;

	dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
				write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);

	for (i = 0; i < iod->nents; i++)
		put_page(sg_page(&iod->sg[i]));
}

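/*
 * NVME_IOCTL_SUBMIT_IO: copy the nvme_user_io descriptor from user space,
 * map the data (and optional metadata) buffers and issue a synchronous
 * command.
 */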
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
	struct nvme_dev *dev = ns->dev;
	struct nvme_queue *nvmeq;
	struct nvme_user_io io;
	struct nvme_command c;
	unsigned length, meta_len;
	int status, i;
	struct nvme_iod *iod, *meta_iod = NULL;
	dma_addr_t meta_dma_addr;
	void *meta, *uninitialized_var(meta_mem);

	if (copy_from_user(&io, uio, sizeof(io)))
		return -EFAULT;
	length = (io.nblocks + 1) << ns->lba_shift;
	meta_len = (io.nblocks + 1) * ns->ms;

	if (meta_len && ((io.metadata & 3) || !io.metadata))
		return -EINVAL;

	switch (io.opcode) {
	case nvme_cmd_write:
	case nvme_cmd_read:
	case nvme_cmd_compare:
		iod = nvme_map_user_pages(dev, io.opcode & 1, io.addr, length);
		break;
	default:
		return -EINVAL;
	}

	if (IS_ERR(iod))
		return PTR_ERR(iod);

	memset(&c, 0, sizeof(c));
	c.rw.opcode = io.opcode;
	c.rw.flags = io.flags;
	c.rw.nsid = cpu_to_le32(ns->ns_id);
	c.rw.slba = cpu_to_le64(io.slba);
	c.rw.length = cpu_to_le16(io.nblocks);
	c.rw.control = cpu_to_le16(io.control);
	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
	c.rw.reftag = cpu_to_le32(io.reftag);
	c.rw.apptag = cpu_to_le16(io.apptag);
	c.rw.appmask = cpu_to_le16(io.appmask);

	if (meta_len) {
		meta_iod = nvme_map_user_pages(dev, io.opcode & 1, io.metadata,
								meta_len);
		if (IS_ERR(meta_iod)) {
			status = PTR_ERR(meta_iod);
			meta_iod = NULL;
			goto unmap;
		}

		meta_mem = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
						&meta_dma_addr, GFP_KERNEL);
		if (!meta_mem) {
			status = -ENOMEM;
			goto unmap;
		}

		if (io.opcode & 1) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta_mem + meta_offset, meta,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		c.rw.metadata = cpu_to_le64(meta_dma_addr);
	}

	length = nvme_setup_prps(dev, &c.common, iod, length, GFP_KERNEL);

	nvmeq = get_nvmeq(dev);
	/*
	 * Since nvme_submit_sync_cmd sleeps, we can't keep preemption
	 * disabled.  We may be preempted at any point, and be rescheduled
	 * to a different CPU.  That will cause cacheline bouncing, but no
	 * additional races since q_lock already protects against other CPUs.
	 */
	put_nvmeq(nvmeq);
	if (length != (io.nblocks + 1) << ns->lba_shift)
		status = -ENOMEM;
	else if (!nvmeq || nvmeq->q_suspended)
		status = -EBUSY;
	else
		status = nvme_submit_sync_cmd(nvmeq, &c, NULL, NVME_IO_TIMEOUT);

	if (meta_len) {
		if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) {
			int meta_offset = 0;

			for (i = 0; i < meta_iod->nents; i++) {
				meta = kmap_atomic(sg_page(&meta_iod->sg[i])) +
						meta_iod->sg[i].offset;
				memcpy(meta, meta_mem + meta_offset,
						meta_iod->sg[i].length);
				kunmap_atomic(meta);
				meta_offset += meta_iod->sg[i].length;
			}
		}

		dma_free_coherent(&dev->pci_dev->dev, meta_len, meta_mem,
								meta_dma_addr);
	}

 unmap:
	nvme_unmap_user_pages(dev, io.opcode & 1, iod);
	nvme_free_iod(dev, iod);

	if (meta_iod) {
		nvme_unmap_user_pages(dev, io.opcode & 1, meta_iod);
		nvme_free_iod(dev, meta_iod);
	}

	return status;
}

static int nvme_user_admin_cmd(struct nvme_dev *dev,
					struct nvme_admin_cmd __user *ucmd)
{
	struct nvme_admin_cmd cmd;
	struct nvme_command c;
	int status, length;
	struct nvme_iod *uninitialized_var(iod);
	unsigned timeout;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	length = cmd.data_len;
	if (cmd.data_len) {
		iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
								length);
		if (IS_ERR(iod))
			return PTR_ERR(iod);
		length = nvme_setup_prps(dev, &c.common, iod, length,
								GFP_KERNEL);
	}

	timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
								ADMIN_TIMEOUT;
	if (length != cmd.data_len)
		status = -ENOMEM;
	else
		status = nvme_submit_sync_cmd(raw_nvmeq(dev, 0), &c,
							&cmd.result, timeout);

	if (cmd.data_len) {
		nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
		nvme_free_iod(dev, iod);
	}

	if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
							sizeof(cmd.result)))
		status = -EFAULT;

	return status;
}

static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
							unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case NVME_IOCTL_ID:
		force_successful_syscall_return();
		return ns->ns_id;
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(ns->dev, (void __user *)arg);
	case NVME_IOCTL_SUBMIT_IO:
		return nvme_submit_io(ns, (void __user *)arg);
	case SG_GET_VERSION_NUM:
		return nvme_sg_get_version_num((void __user *)arg);
	case SG_IO:
		return nvme_sg_io(ns, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
					unsigned int cmd, unsigned long arg)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;

	switch (cmd) {
	case SG_IO:
		return nvme_sg_io32(ns, arg);
	}
	return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
#define nvme_compat_ioctl	NULL
#endif

static int nvme_open(struct block_device *bdev, fmode_t mode)
{
	struct nvme_ns *ns = bdev->bd_disk->private_data;
	struct nvme_dev *dev = ns->dev;

	kref_get(&dev->kref);
	return 0;
}

static void nvme_free_dev(struct kref *kref);

static void nvme_release(struct gendisk *disk, fmode_t mode)
{
	struct nvme_ns *ns = disk->private_data;
	struct nvme_dev *dev = ns->dev;

	kref_put(&dev->kref, nvme_free_dev);
}

static const struct block_device_operations nvme_fops = {
	.owner		= THIS_MODULE,
	.ioctl		= nvme_ioctl,
	.compat_ioctl	= nvme_compat_ioctl,
	.open		= nvme_open,
	.release	= nvme_release,
};

static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
{
	while (bio_list_peek(&nvmeq->sq_cong)) {
		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;

		if (bio_list_empty(&nvmeq->sq_cong))
			remove_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
			if (bio_list_empty(&nvmeq->sq_cong))
				add_wait_queue(&nvmeq->sq_full,
							&nvmeq->sq_cong_wait);
			bio_list_add_head(&nvmeq->sq_cong, bio);
			break;
		}
	}
}

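/*
 * Driver-wide polling thread: roughly once a second, check each device for
 * a fatal controller status, reap completions, cancel timed-out commands
 * and resubmit congested bios on every queue.
 */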
static int nvme_kthread(void *data)
{
	struct nvme_dev *dev, *next;

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		spin_lock(&dev_list_lock);
		list_for_each_entry_safe(dev, next, &dev_list, node) {
			int i;
			if (readl(&dev->bar->csts) & NVME_CSTS_CFS &&
							dev->initialized) {
				if (work_busy(&dev->reset_work))
					continue;
				list_del_init(&dev->node);
				dev_warn(&dev->pci_dev->dev,
					"Failed status, reset controller\n");
				PREPARE_WORK(&dev->reset_work,
							nvme_reset_failed_dev);
				queue_work(nvme_workq, &dev->reset_work);
				continue;
			}
1720
			rcu_read_lock();
1721
			for (i = 0; i < dev->queue_count; i++) {
1722 1723
				struct nvme_queue *nvmeq =
						rcu_dereference(dev->queues[i]);
1724 1725
				if (!nvmeq)
					continue;
1726
				spin_lock_irq(&nvmeq->q_lock);
1727 1728
				if (nvmeq->q_suspended)
					goto unlock;
1729
				nvme_process_cq(nvmeq);
1730
				nvme_cancel_ios(nvmeq, true);
1731
				nvme_resubmit_bios(nvmeq);
1732
 unlock:
1733 1734
				spin_unlock_irq(&nvmeq->q_lock);
			}
1735
			rcu_read_unlock();
1736 1737
		}
		spin_unlock(&dev_list_lock);
1738
		schedule_timeout(round_jiffies_relative(HZ));
1739 1740 1741 1742
	}
	return 0;
}

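/*
 * Advertise Dataset Management (deallocate) support to the block layer as
 * discard, with a granularity of one logical block.
 */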
static void nvme_config_discard(struct nvme_ns *ns)
{
	u32 logical_block_size = queue_logical_block_size(ns->queue);
	ns->queue->limits.discard_zeroes_data = 0;
	ns->queue->limits.discard_alignment = logical_block_size;
	ns->queue->limits.discard_granularity = logical_block_size;
	ns->queue->limits.max_discard_sectors = 0xffffffff;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
}

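/*
 * Allocate a namespace: set up its request queue and gendisk from the
 * Identify Namespace data.  Namespaces hidden by an LBA range type are
 * skipped.
 */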
static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
{
	struct nvme_ns *ns;
	struct gendisk *disk;
	int lbaf;

	if (rt->attributes & NVME_LBART_ATTRIB_HIDE)
		return NULL;

	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
	if (!ns)
		return NULL;
	ns->queue = blk_alloc_queue(GFP_KERNEL);
	if (!ns->queue)
		goto out_free_ns;
	ns->queue->queue_flags = QUEUE_FLAG_DEFAULT;
	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
	blk_queue_make_request(ns->queue, nvme_make_request);
	ns->dev = dev;
	ns->queue->queuedata = ns;

	disk = alloc_disk(0);
	if (!disk)
		goto out_free_queue;
	ns->ns_id = nsid;
	ns->disk = disk;
	lbaf = id->flbas & 0xf;
	ns->lba_shift = id->lbaf[lbaf].ds;
	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
	if (dev->max_hw_sectors)
		blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);

	disk->major = nvme_major;
	disk->first_minor = 0;
	disk->fops = &nvme_fops;
	disk->private_data = ns;
	disk->queue = ns->queue;
	disk->driverfs_dev = &dev->pci_dev->dev;
	disk->flags = GENHD_FL_EXT_DEVT;
	sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
	set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));

	if (dev->oncs & NVME_CTRL_ONCS_DSM)
		nvme_config_discard(ns);

	return ns;

 out_free_queue:
	blk_cleanup_queue(ns->queue);
 out_free_ns:
	kfree(ns);
	return NULL;
}

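/*
 * Ask the controller for 'count' I/O submission and completion queues via
 * Set Features (Number of Queues) and return how many it actually granted.
 */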
static int set_queue_count(struct nvme_dev *dev, int count)
{
	int status;
	u32 result;
	u32 q_count = (count - 1) | ((count - 1) << 16);

	status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
								&result);
	if (status)
		return status < 0 ? -EIO : -EBUSY;
	return min(result & 0xffff, result >> 16) + 1;
}

static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}

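/*
 * Size the doorbell BAR mapping for the requested queue count, allocate
 * MSI-X (or MSI) vectors, spread them across the online CPUs, and create
 * one I/O queue pair per vector.
 */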
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = raw_nvmeq(dev, 0);
	struct pci_dev *pdev = dev->pci_dev;
	int result, cpu, i, vecs, nr_io_queues, size, q_depth;

	nr_io_queues = num_online_cpus();
	result = set_queue_count(dev, nr_io_queues);
	if (result < 0)
		return result;
	if (result < nr_io_queues)
		nr_io_queues = result;

	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {
		iounmap(dev->bar);
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
			if (dev->bar)
				break;
			if (!--nr_io_queues)
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);
		} while (1);
		dev->dbs = ((void __iomem *)dev->bar) + 4096;
		adminq->q_db = dev->dbs;
	}

	/* Deregister the admin queue's interrupt */
	free_irq(dev->entry[0].vector, adminq);

	vecs = nr_io_queues;
	for (i = 0; i < vecs; i++)
		dev->entry[i].entry = i;
	for (;;) {
		result = pci_enable_msix(pdev, dev->entry, vecs);
		if (result <= 0)
			break;
		vecs = result;
	}

	if (result < 0) {
		vecs = nr_io_queues;
		if (vecs > 32)
			vecs = 32;
		for (;;) {
			result = pci_enable_msi_block(pdev, vecs);
			if (result == 0) {
				for (i = 0; i < vecs; i++)
					dev->entry[i].vector = i + pdev->irq;
				break;
			} else if (result < 0) {
				vecs = 1;
				break;
			}
			vecs = result;
		}
	}

	/*
	 * Should investigate if there's a performance win from allocating
	 * more queues than interrupt vectors; it might allow the submission
	 * path to scale better, even if the receive path is limited by the
	 * number of interrupts.
	 */
	nr_io_queues = vecs;

	result = queue_request_irq(dev, adminq, adminq->irqname);
	if (result) {
		adminq->q_suspended = 1;
		goto free_queues;
	}

	/* Free previously allocated queues that are no longer usable */
	nvme_free_queues(dev, nr_io_queues);

	cpu = cpumask_first(cpu_online_mask);
	for (i = 0; i < nr_io_queues; i++) {
		irq_set_affinity_hint(dev->entry[i].vector, get_cpu_mask(cpu));
		cpu = cpumask_next(cpu, cpu_online_mask);
	}

	q_depth = min_t(int, NVME_CAP_MQES(readq(&dev->bar->cap)) + 1,
								NVME_Q_DEPTH);
	for (i = dev->queue_count - 1; i < nr_io_queues; i++) {
		if (!nvme_alloc_queue(dev, i + 1, q_depth, i)) {
			result = -ENOMEM;
			goto free_queues;
		}
	}

	for (; i < num_possible_cpus(); i++) {
		int target = i % rounddown_pow_of_two(dev->queue_count - 1);
		rcu_assign_pointer(dev->queues[i + 1], dev->queues[target + 1]);
	}

	for (i = 1; i < dev->queue_count; i++) {
		result = nvme_create_queue(raw_nvmeq(dev, i), i);
		if (result) {
			for (--i; i > 0; i--)
				nvme_disable_queue(dev, i);
			goto free_queues;
		}
	}

	return 0;

 free_queues:
	nvme_free_queues(dev, 1);
	return result;
}

/*
 * Return: error value if an error occurred setting up the queues or calling
 * Identify Device.  0 if these succeeded, even if adding some of the
 * namespaces failed.  At the moment, these failures are silent.  TBD which
 * failures should be reported.
 */
static int nvme_dev_add(struct nvme_dev *dev)
{
	struct pci_dev *pdev = dev->pci_dev;
	int res;
	unsigned nn, i;
	struct nvme_ns *ns;
	struct nvme_id_ctrl *ctrl;
	struct nvme_id_ns *id_ns;
	void *mem;
	dma_addr_t dma_addr;
	int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;

	mem = dma_alloc_coherent(&pdev->dev, 8192, &dma_addr, GFP_KERNEL);
	if (!mem)
		return -ENOMEM;

	res = nvme_identify(dev, 0, 1, dma_addr);
	if (res) {
		res = -EIO;
		goto out;
	}

	ctrl = mem;
	nn = le32_to_cpup(&ctrl->nn);
	dev->oncs = le16_to_cpup(&ctrl->oncs);
	dev->abort_limit = ctrl->acl + 1;
	memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
	memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
	memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
	if (ctrl->mdts)
		dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
	if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
			(pdev->device == 0x0953) && ctrl->vs[3])
		dev->stripe_size = 1 << (ctrl->vs[3] + shift);

	id_ns = mem;
	for (i = 1; i <= nn; i++) {
		res = nvme_identify(dev, i, 0, dma_addr);
		if (res)
			continue;

		if (id_ns->ncap == 0)
			continue;

		res = nvme_get_features(dev, NVME_FEAT_LBA_RANGE, i,
							dma_addr + 4096, NULL);
		if (res)
			memset(mem + 4096, 0, 4096);

		ns = nvme_alloc_ns(dev, i, mem, mem + 4096);
		if (ns)
			list_add_tail(&ns->list, &dev->namespaces);
	}
	list_for_each_entry(ns, &dev->namespaces, list)
		add_disk(ns->disk);
	res = 0;

 out:
	dma_free_coherent(&dev->pci_dev->dev, 8192, mem, dma_addr);
	return res;
}

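/*
 * Enable the PCI device, claim its memory BARs, set the DMA mask and map
 * the controller registers and doorbells.
 */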
static int nvme_dev_map(struct nvme_dev *dev)
{
	int bars, result = -ENOMEM;
	struct pci_dev *pdev = dev->pci_dev;

	if (pci_enable_device_mem(pdev))
		return result;

	dev->entry[0].vector = pdev->irq;
	pci_set_master(pdev);
	bars = pci_select_bars(pdev, IORESOURCE_MEM);
	if (pci_request_selected_regions(pdev, bars, "nvme"))
		goto disable_pci;

	if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
		goto disable;

	dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
	if (!dev->bar)
		goto disable;
	if (readl(&dev->bar->csts) == -1) {
		result = -ENODEV;
		goto unmap;
	}
	dev->db_stride = 1 << NVME_CAP_STRIDE(readq(&dev->bar->cap));
	dev->dbs = ((void __iomem *)dev->bar) + 4096;

	return 0;

 unmap:
	iounmap(dev->bar);
	dev->bar = NULL;
 disable:
	pci_release_regions(pdev);
 disable_pci:
	pci_disable_device(pdev);
	return result;
}

static void nvme_dev_unmap(struct nvme_dev *dev)
{
	if (dev->pci_dev->msi_enabled)
		pci_disable_msi(dev->pci_dev);
	else if (dev->pci_dev->msix_enabled)
		pci_disable_msix(dev->pci_dev);

	if (dev->bar) {
		iounmap(dev->bar);
		dev->bar = NULL;
		pci_release_regions(dev->pci_dev);
	}

	if (pci_is_enabled(dev->pci_dev))
		pci_disable_device(dev->pci_dev);
}

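/*
 * Asynchronous I/O queue deletion: each queue's delete SQ/CQ commands run
 * from a temporary kthread worker while nvme_wait_dq() waits (killably)
 * for the shared refcount to drain, falling back to a hard disable of the
 * controller if it times out.
 */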
struct nvme_delq_ctx {
	struct task_struct *waiter;
	struct kthread_worker *worker;
	atomic_t refcount;
};

static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
{
	dq->waiter = current;
	mb();

	for (;;) {
		set_current_state(TASK_KILLABLE);
		if (!atomic_read(&dq->refcount))
			break;
		if (!schedule_timeout(ADMIN_TIMEOUT) ||
					fatal_signal_pending(current)) {
			set_current_state(TASK_RUNNING);

			nvme_disable_ctrl(dev, readq(&dev->bar->cap));
			nvme_disable_queue(dev, 0);

			send_sig(SIGKILL, dq->worker->task, 1);
			flush_kthread_worker(dq->worker);
			return;
		}
	}
	set_current_state(TASK_RUNNING);
}

static void nvme_put_dq(struct nvme_delq_ctx *dq)
{
	atomic_dec(&dq->refcount);
	if (dq->waiter)
		wake_up_process(dq->waiter);
}

static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
{
	atomic_inc(&dq->refcount);
	return dq;
}

static void nvme_del_queue_end(struct nvme_queue *nvmeq)
{
	struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;

	nvme_clear_queue(nvmeq);
	nvme_put_dq(dq);
}

static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
						kthread_work_func_t fn)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;
	c.delete_queue.qid = cpu_to_le16(nvmeq->qid);

	init_kthread_work(&nvmeq->cmdinfo.work, fn);
	return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo);
}

static void nvme_del_cq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	nvme_del_queue_end(nvmeq);
}

static int nvme_delete_cq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
						nvme_del_cq_work_handler);
}

static void nvme_del_sq_work_handler(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	int status = nvmeq->cmdinfo.status;

	if (!status)
		status = nvme_delete_cq(nvmeq);
	if (status)
		nvme_del_queue_end(nvmeq);
}

static int nvme_delete_sq(struct nvme_queue *nvmeq)
{
	return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
						nvme_del_sq_work_handler);
}

static void nvme_del_queue_start(struct kthread_work *work)
{
	struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
							cmdinfo.work);
	allow_signal(SIGKILL);
	if (nvme_delete_sq(nvmeq))
		nvme_del_queue_end(nvmeq);
}

static void nvme_disable_io_queues(struct nvme_dev *dev)
{
	int i;
	DEFINE_KTHREAD_WORKER_ONSTACK(worker);
	struct nvme_delq_ctx dq;
	struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
					&worker, "nvme%d", dev->instance);

	if (IS_ERR(kworker_task)) {
		dev_err(&dev->pci_dev->dev,
			"Failed to create queue del task\n");
		for (i = dev->queue_count - 1; i > 0; i--)
			nvme_disable_queue(dev, i);
		return;
	}

	dq.waiter = NULL;
	atomic_set(&dq.refcount, 0);
	dq.worker = &worker;
	for (i = dev->queue_count - 1; i > 0; i--) {
		struct nvme_queue *nvmeq = raw_nvmeq(dev, i);

		if (nvme_suspend_queue(nvmeq))
			continue;
		nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
		nvmeq->cmdinfo.worker = dq.worker;
		init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
		queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
	}
	nvme_wait_dq(&dq, dev);
	kthread_stop(kworker_task);
}

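/*
 * Quiesce the controller: take the device off the polling list, tear down
 * the I/O queues (gracefully if the controller is still responding) and
 * unmap it.
 */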
static void nvme_dev_shutdown(struct nvme_dev *dev)
{
	int i;

	dev->initialized = 0;

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) {
		for (i = dev->queue_count - 1; i >= 0; i--) {
			struct nvme_queue *nvmeq = raw_nvmeq(dev, i);
			nvme_suspend_queue(nvmeq);
			nvme_clear_queue(nvmeq);
		}
	} else {
		nvme_disable_io_queues(dev);
		nvme_shutdown_ctrl(dev);
		nvme_disable_queue(dev, 0);
	}
	nvme_dev_unmap(dev);
}

static void nvme_dev_remove(struct nvme_dev *dev)
{
	struct nvme_ns *ns;

	list_for_each_entry(ns, &dev->namespaces, list) {
		if (ns->disk->flags & GENHD_FL_UP)
			del_gendisk(ns->disk);
		if (!blk_queue_dying(ns->queue))
			blk_cleanup_queue(ns->queue);
	}
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	struct device *dmadev = &dev->pci_dev->dev;
	dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	/* Optimisation for I/Os between 4k and 128k */
	dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}

static void nvme_release_prp_pools(struct nvme_dev *dev)
{
	dma_pool_destroy(dev->prp_page_pool);
	dma_pool_destroy(dev->prp_small_pool);
}

static DEFINE_IDA(nvme_instance_ida);

static int nvme_set_instance(struct nvme_dev *dev)
{
	int instance, error;

	do {
		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
			return -ENODEV;

		spin_lock(&dev_list_lock);
		error = ida_get_new(&nvme_instance_ida, &instance);
		spin_unlock(&dev_list_lock);
	} while (error == -EAGAIN);

	if (error)
		return -ENODEV;

	dev->instance = instance;
	return 0;
}

static void nvme_release_instance(struct nvme_dev *dev)
{
	spin_lock(&dev_list_lock);
	ida_remove(&nvme_instance_ida, dev->instance);
	spin_unlock(&dev_list_lock);
}

static void nvme_free_namespaces(struct nvme_dev *dev)
{
	struct nvme_ns *ns, *next;

	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
		list_del(&ns->list);
		put_disk(ns->disk);
		kfree(ns);
	}
}

static void nvme_free_dev(struct kref *kref)
{
	struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);

	nvme_free_namespaces(dev);
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
}

static int nvme_dev_open(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = container_of(f->private_data, struct nvme_dev,
								miscdev);
	kref_get(&dev->kref);
	f->private_data = dev;
	return 0;
}

static int nvme_dev_release(struct inode *inode, struct file *f)
{
	struct nvme_dev *dev = f->private_data;
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
{
	struct nvme_dev *dev = f->private_data;
	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_admin_cmd(dev, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};

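/*
 * Bring the controller up far enough to accept commands: map it, create
 * the admin queue, register with the polling thread and set up the I/O
 * queues.
 */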
static int nvme_dev_start(struct nvme_dev *dev)
{
	int result;

	result = nvme_dev_map(dev);
	if (result)
		return result;

	result = nvme_configure_admin_queue(dev);
	if (result)
		goto unmap;

	spin_lock(&dev_list_lock);
	list_add(&dev->node, &dev_list);
	spin_unlock(&dev_list_lock);

	result = nvme_setup_io_queues(dev);
	if (result && result != -EBUSY)
		goto disable;

	return result;

 disable:
	nvme_disable_queue(dev, 0);
	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);
 unmap:
	nvme_dev_unmap(dev);
	return result;
}

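/*
 * Last-resort cleanup for a controller that could not be reset: detach it
 * from the PCI bus and drop the probe reference.
 */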
static int nvme_remove_dead_ctrl(void *arg)
{
	struct nvme_dev *dev = (struct nvme_dev *)arg;
	struct pci_dev *pdev = dev->pci_dev;

	if (pci_get_drvdata(pdev))
		pci_stop_and_remove_bus_device(pdev);
	kref_put(&dev->kref, nvme_free_dev);
	return 0;
}

static void nvme_remove_disks(struct work_struct *ws)
{
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);

	nvme_dev_remove(dev);
	nvme_free_queues(dev, 1);
}

static int nvme_dev_resume(struct nvme_dev *dev)
{
	int ret;

	ret = nvme_dev_start(dev);
	if (ret && ret != -EBUSY)
		return ret;
	if (ret == -EBUSY) {
		spin_lock(&dev_list_lock);
		PREPARE_WORK(&dev->reset_work, nvme_remove_disks);
		queue_work(nvme_workq, &dev->reset_work);
		spin_unlock(&dev_list_lock);
	}
	dev->initialized = 1;
	return 0;
}

static void nvme_dev_reset(struct nvme_dev *dev)
{
	nvme_dev_shutdown(dev);
	if (nvme_dev_resume(dev)) {
		dev_err(&dev->pci_dev->dev, "Device failed to resume\n");
		kref_get(&dev->kref);
		if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
							dev->instance))) {
			dev_err(&dev->pci_dev->dev,
				"Failed to start controller remove task\n");
			kref_put(&dev->kref, nvme_free_dev);
		}
	}
}

static void nvme_reset_failed_dev(struct work_struct *ws)
{
	struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
	nvme_dev_reset(dev);
}

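/*
 * PCI probe: allocate per-device state, start the controller, register the
 * namespaces' block devices and a per-controller misc character device.
 * An -EBUSY start skips namespace enumeration and leaves only the
 * character device so the controller can still be reached via admin ioctls.
 */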
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	int result = -ENOMEM;
	struct nvme_dev *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry),
								GFP_KERNEL);
	if (!dev->entry)
		goto free;
	dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *),
								GFP_KERNEL);
	if (!dev->queues)
		goto free;

	INIT_LIST_HEAD(&dev->namespaces);
	INIT_WORK(&dev->reset_work, nvme_reset_failed_dev);
	dev->pci_dev = pdev;
	pci_set_drvdata(pdev, dev);
	result = nvme_set_instance(dev);
	if (result)
		goto free;

	result = nvme_setup_prp_pools(dev);
	if (result)
		goto release;

	kref_init(&dev->kref);
	result = nvme_dev_start(dev);
	if (result) {
		if (result == -EBUSY)
			goto create_cdev;
		goto release_pools;
	}

	result = nvme_dev_add(dev);
	if (result)
		goto shutdown;

 create_cdev:
	scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance);
	dev->miscdev.minor = MISC_DYNAMIC_MINOR;
	dev->miscdev.parent = &pdev->dev;
	dev->miscdev.name = dev->name;
	dev->miscdev.fops = &nvme_dev_fops;
	result = misc_register(&dev->miscdev);
	if (result)
		goto remove;

	dev->initialized = 1;
	return 0;

 remove:
	nvme_dev_remove(dev);
	nvme_free_namespaces(dev);
 shutdown:
	nvme_dev_shutdown(dev);
 release_pools:
	nvme_free_queues(dev, 0);
	nvme_release_prp_pools(dev);
 release:
	nvme_release_instance(dev);
 free:
	kfree(dev->queues);
	kfree(dev->entry);
	kfree(dev);
	return result;
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_shutdown(dev);
}

static void nvme_remove(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);

	spin_lock(&dev_list_lock);
	list_del_init(&dev->node);
	spin_unlock(&dev_list_lock);

	pci_set_drvdata(pdev, NULL);
	flush_work(&dev->reset_work);
	misc_deregister(&dev->miscdev);
	nvme_dev_remove(dev);
	nvme_dev_shutdown(dev);
	nvme_free_queues(dev, 0);
	rcu_barrier();
	nvme_release_instance(dev);
	nvme_release_prp_pools(dev);
	kref_put(&dev->kref, nvme_free_dev);
}

/* These functions are yet to be implemented */
#define nvme_error_detected NULL
#define nvme_dump_registers NULL
#define nvme_link_reset NULL
#define nvme_slot_reset NULL
#define nvme_error_resume NULL

#ifdef CONFIG_PM_SLEEP
static int nvme_suspend(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_dev_shutdown(ndev);
	return 0;
}

static int nvme_resume(struct device *dev)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

K
	if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
		PREPARE_WORK(&ndev->reset_work, nvme_reset_failed_dev);
		queue_work(nvme_workq, &ndev->reset_work);
	}
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

static const struct pci_error_handlers nvme_err_handler = {
	.mmio_enabled	= nvme_dump_registers,
	.link_reset	= nvme_link_reset,
	.slot_reset	= nvme_slot_reset,
	.resume		= nvme_error_resume,
};

/* Move to pci_ids.h later */
#define PCI_CLASS_STORAGE_EXPRESS	0x010802

static DEFINE_PCI_DEVICE_TABLE(nvme_id_table) = {
	{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
	{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);

static struct pci_driver nvme_driver = {
	.name		= "nvme",
	.id_table	= nvme_id_table,
	.probe		= nvme_probe,
	.remove		= nvme_remove,
	.shutdown	= nvme_shutdown,
	.driver		= {
		.pm	= &nvme_dev_pm_ops,
	},
	.err_handler	= &nvme_err_handler,
};

static int __init nvme_init(void)
{
	int result;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	result = -ENOMEM;
	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		goto kill_kthread;

	result = register_blkdev(nvme_major, "nvme");
	if (result < 0)
		goto kill_workq;
	else if (result > 0)
		nvme_major = result;

	result = pci_register_driver(&nvme_driver);
	if (result)
		goto unregister_blkdev;
	return 0;

 unregister_blkdev:
	unregister_blkdev(nvme_major, "nvme");
 kill_workq:
	destroy_workqueue(nvme_workq);
 kill_kthread:
	kthread_stop(nvme_thread);
	return result;
}

static void __exit nvme_exit(void)
{
	pci_unregister_driver(&nvme_driver);
	unregister_blkdev(nvme_major, "nvme");
	destroy_workqueue(nvme_workq);
	kthread_stop(nvme_thread);
}

MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
MODULE_LICENSE("GPL");
MODULE_VERSION("0.8");
module_init(nvme_init);
module_exit(nvme_exit);