/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
25
#include <linux/badblocks.h>
D
Dan Williams 已提交
26
#include <linux/memremap.h>
27
#include <linux/vmalloc.h>
D
Dan Williams 已提交
28
#include <linux/pfn_t.h>
29
#include <linux/slab.h>
30
#include <linux/pmem.h>
31
#include <linux/nd.h>
32
#include "pfn.h"
33
#include "nd.h"
34 35 36 37

struct pmem_device {
	struct request_queue	*pmem_queue;
	struct gendisk		*pmem_disk;
38
	struct nd_namespace_common *ndns;
39 40 41

	/* One contiguous memory region per device */
	phys_addr_t		phys_addr;
42 43
	/* when non-zero this device is hosting a 'pfn' instance */
	phys_addr_t		data_offset;
A
Arnd Bergmann 已提交
44
	u64			pfn_flags;
45
	void __pmem		*virt_addr;
46
	size_t			size;
47
	struct badblocks	bb;
48 49 50 51
};

/* Block major number; assigned from register_blkdev() in pmem_init(). */
static int pmem_major;

52 53 54 55 56 57 58 59 60 61 62 63 64 65
/*
 * Report whether the range starting at @sector and covering @len bytes
 * (converted to 512-byte sectors) hits an entry in the bad block list @bb.
 * An empty list short-circuits to "not bad" without a lookup.
 */
static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
{
	sector_t first_bad;
	int num_bad;

	if (!bb->count)
		return false;

	return badblocks_check(bb, sector, len / 512, &first_bad,
			&num_bad) != 0;
}

/*
 * Copy @len bytes between @page (at byte offset @off) and persistent memory
 * at @sector, in the direction given by @rw.
 *
 * Reads that overlap a known bad block fail with -EIO and transfer nothing.
 * The page is mapped with kmap_atomic() for the duration of the copy; every
 * exit path, including the -EIO one, must go through kunmap_atomic().
 *
 * Returns 0 on success or -EIO for a read of a bad range.
 */
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, int rw,
			sector_t sector)
{
	int rc = 0;
	void *mem = kmap_atomic(page);
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;

	if (rw == READ) {
		if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
			/* fall through to the unmap below instead of returning */
			rc = -EIO;
		} else {
			memcpy_from_pmem(mem + off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		flush_dcache_page(page);
		memcpy_to_pmem(pmem_addr, mem + off, len);
	}

	kunmap_atomic(mem);
	return rc;
}

87
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
88
{
89
	int rc = 0;
D
Dan Williams 已提交
90 91
	bool do_acct;
	unsigned long start;
92 93
	struct bio_vec bvec;
	struct bvec_iter iter;
D
Dan Williams 已提交
94 95
	struct block_device *bdev = bio->bi_bdev;
	struct pmem_device *pmem = bdev->bd_disk->private_data;
96

D
Dan Williams 已提交
97
	do_acct = nd_iostat_start(bio, &start);
98 99 100 101 102 103 104 105 106
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, bio_data_dir(bio),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
D
Dan Williams 已提交
107 108
	if (do_acct)
		nd_iostat_end(bio, start);
109 110 111 112

	if (bio_data_dir(bio))
		wmb_pmem();

113
	bio_endio(bio);
114
	return BLK_QC_T_NONE;
115 116 117 118 119 120
}

/*
 * ->rw_page handler: transfer one whole page at @sector.
 *
 * Returns the result of pmem_do_bvec().  page_endio() is only called when
 * that result is 0.
 */
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
	const int is_write = rw & WRITE;
	int rc;

	rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
	if (is_write)
		wmb_pmem();

	/*
	 * The ->rw_page interface is subtle and tricky.  The core retries
	 * on any error, so completing the page here on failure would lead
	 * to a double completion and crashes.  Only the success case may
	 * call page_endio().
	 */
	if (rc)
		return rc;

	page_endio(page, is_write, 0);
	return 0;
}

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
D
Dan Williams 已提交
140
		      void __pmem **kaddr, pfn_t *pfn)
141 142
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
143
	resource_size_t offset = sector * 512 + pmem->data_offset;
144

145
	*kaddr = pmem->virt_addr + offset;
D
Dan Williams 已提交
146
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
147

148
	return pmem->size - offset;
149 150 151 152 153 154
}

static const struct block_device_operations pmem_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		pmem_rw_page,
	.direct_access =	pmem_direct_access,
155
	.revalidate_disk =	nvdimm_revalidate_disk,
156 157
};

158 159
static struct pmem_device *pmem_alloc(struct device *dev,
		struct resource *res, int id)
160 161
{
	struct pmem_device *pmem;
162
	struct request_queue *q;
163

164
	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
165
	if (!pmem)
166
		return ERR_PTR(-ENOMEM);
167 168 169

	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
170
	if (!arch_has_wmb_pmem())
171
		dev_warn(dev, "unable to guarantee persistence of writes\n");
172

173 174
	if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
			dev_name(dev))) {
175 176
		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
				&pmem->phys_addr, pmem->size);
177
		return ERR_PTR(-EBUSY);
178 179
	}

180 181 182 183
	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return ERR_PTR(-ENOMEM);

D
Dan Williams 已提交
184 185
	pmem->pfn_flags = PFN_DEV;
	if (pmem_should_map_pages(dev)) {
186
		pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
187
				&q->q_usage_counter, NULL);
D
Dan Williams 已提交
188 189
		pmem->pfn_flags |= PFN_MAP;
	} else
D
Dan Williams 已提交
190 191 192
		pmem->virt_addr = (void __pmem *) devm_memremap(dev,
				pmem->phys_addr, pmem->size,
				ARCH_MEMREMAP_PMEM);
193

194 195
	if (IS_ERR(pmem->virt_addr)) {
		blk_cleanup_queue(q);
196
		return (void __force *) pmem->virt_addr;
197
	}
198

199
	pmem->pmem_queue = q;
200 201 202 203 204
	return pmem;
}

/*
 * Tear down the gendisk and request queue of @pmem.  Does nothing when no
 * disk has been attached.
 */
static void pmem_detach_disk(struct pmem_device *pmem)
{
	struct gendisk *disk = pmem->pmem_disk;

	if (disk) {
		del_gendisk(disk);
		put_disk(disk);
		blk_cleanup_queue(pmem->pmem_queue);
	}
}

213 214
static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns, struct pmem_device *pmem)
215
{
216
	int nid = dev_to_node(dev);
217
	struct gendisk *disk;
218 219

	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
220
	blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
221
	blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
222
	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
223
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
224

225
	disk = alloc_disk_node(0, nid);
226 227 228 229
	if (!disk) {
		blk_cleanup_queue(pmem->pmem_queue);
		return -ENOMEM;
	}
230 231

	disk->major		= pmem_major;
232
	disk->first_minor	= 0;
233 234 235 236
	disk->fops		= &pmem_fops;
	disk->private_data	= pmem;
	disk->queue		= pmem->pmem_queue;
	disk->flags		= GENHD_FL_EXT_DEVT;
V
Vishal Verma 已提交
237
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
238 239
	disk->driverfs_dev = dev;
	set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
240
	pmem->pmem_disk = disk;
241
	devm_exit_badblocks(dev, &pmem->bb);
242 243 244
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
245

246
	disk->bb = &pmem->bb;
247
	add_disk(disk);
248
	revalidate_disk(disk);
249

250 251
	return 0;
}
252

253 254 255 256 257 258 259 260 261 262
/*
 * ->rw_bytes handler installed on the namespace by nd_pmem_probe(): copy
 * @size bytes between @buf and the device mapping at byte @offset.
 *
 * Returns -EFAULT (with a one-time warning) when the request runs past the
 * end of the device, -EIO when a read overlaps a bad block, 0 otherwise.
 * Writes are followed by wmb_pmem().
 */
static int pmem_rw_bytes(struct nd_namespace_common *ndns,
		resource_size_t offset, void *buf, size_t size, int rw)
{
	struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
	unsigned int sz_align;

	if (unlikely(offset + size > pmem->size)) {
		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
		return -EFAULT;
	}

	if (rw != READ) {
		memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
		wmb_pmem();
		return 0;
	}

	/* widen the request to whole 512-byte sectors for the bad block check */
	sz_align = ALIGN(size + (offset & (512 - 1)), 512);
	if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
		return -EIO;

	memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
	return 0;
}

277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
static int nd_pfn_init(struct nd_pfn *nd_pfn)
{
	struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
	struct nd_namespace_common *ndns = nd_pfn->ndns;
	struct nd_region *nd_region;
	unsigned long npfns;
	phys_addr_t offset;
	u64 checksum;
	int rc;

	if (!pfn_sb)
		return -ENOMEM;

	nd_pfn->pfn_sb = pfn_sb;
	rc = nd_pfn_validate(nd_pfn);
293 294 295
	if (rc == -ENODEV)
		/* no info block, do init */;
	else
296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
		return rc;

	nd_region = to_nd_region(nd_pfn->dev.parent);
	if (nd_region->ro) {
		dev_info(&nd_pfn->dev,
				"%s is read-only, unable to init metadata\n",
				dev_name(&nd_region->dev));
		goto err;
	}

	memset(pfn_sb, 0, sizeof(*pfn_sb));
	npfns = (pmem->size - SZ_8K) / SZ_4K;
	/*
	 * Note, we use 64 here for the standard size of struct page,
	 * debugging options may cause it to be larger in which case the
	 * implementation will limit the pfns advertised through
	 * ->direct_access() to those that are included in the memmap.
	 */
	if (nd_pfn->mode == PFN_MODE_PMEM)
315
		offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
316
	else if (nd_pfn->mode == PFN_MODE_RAM)
317
		offset = ALIGN(SZ_8K, nd_pfn->align);
318 319 320 321 322 323 324 325 326
	else
		goto err;

	npfns = (pmem->size - offset) / SZ_4K;
	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
	pfn_sb->dataoff = cpu_to_le64(offset);
	pfn_sb->npfns = cpu_to_le64(npfns);
	memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
327
	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358
	pfn_sb->version_major = cpu_to_le16(1);
	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
	pfn_sb->checksum = cpu_to_le64(checksum);

	rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
	if (rc)
		goto err;

	return 0;
 err:
	nd_pfn->pfn_sb = NULL;
	kfree(pfn_sb);
	return -ENXIO;
}

/*
 * Undo nvdimm_namespace_attach_pfn(): detach the pmem disk of the pfn
 * device claiming @ndns and free its info block.  Always returns 0.
 */
static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
{
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);

	/* free pmem disk */
	pmem_detach_disk(pmem);

	/* release nd_pfn resources */
	kfree(nd_pfn->pfn_sb);
	nd_pfn->pfn_sb = NULL;

	return 0;
}

359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378
/*
 * Memory is hotplugged at section granularity, so the reserved area is
 * padded from the start of the containing section up to the namespace
 * base address.  This returns the pfn of @base rounded down to its section.
 */
static unsigned long init_altmap_base(resource_size_t base)
{
	return PFN_SECTION_ALIGN_DOWN(__phys_to_pfn(base));
}

/*
 * Number of pfns to reserve for @base: the pfns covering SZ_8K plus the
 * padding between the section-aligned base pfn and the pfn of @base.
 */
static unsigned long init_altmap_reserve(resource_size_t base)
{
	unsigned long base_pfn = __phys_to_pfn(base);
	unsigned long pad = base_pfn - PFN_SECTION_ALIGN_DOWN(base_pfn);

	return __phys_to_pfn(SZ_8K) + pad;
}

379
static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
380
{
381 382 383 384
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
	struct device *dev = &nd_pfn->dev;
	struct nd_region *nd_region;
385
	struct vmem_altmap *altmap;
386 387
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
388
	struct request_queue *q;
389 390
	phys_addr_t offset;
	int rc;
391
	struct vmem_altmap __altmap = {
392 393
		.base_pfn = init_altmap_base(nsio->res.start),
		.reserve = init_altmap_reserve(nsio->res.start),
394
	};
395 396 397 398 399 400 401 402 403 404 405 406 407

	if (!nd_pfn->uuid || !nd_pfn->ndns)
		return -ENODEV;

	nd_region = to_nd_region(dev->parent);
	rc = nd_pfn_init(nd_pfn);
	if (rc)
		return rc;

	pfn_sb = nd_pfn->pfn_sb;
	offset = le64_to_cpu(pfn_sb->dataoff);
	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
	if (nd_pfn->mode == PFN_MODE_RAM) {
408
		if (offset < SZ_8K)
409 410 411
			return -EINVAL;
		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
		altmap = NULL;
412 413 414 415 416 417 418 419 420 421 422
	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
		nd_pfn->npfns = (resource_size(&nsio->res) - offset)
			/ PAGE_SIZE;
		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
			dev_info(&nd_pfn->dev,
					"number of pfns truncated from %lld to %ld\n",
					le64_to_cpu(nd_pfn->pfn_sb->npfns),
					nd_pfn->npfns);
		altmap = & __altmap;
		altmap->free = __phys_to_pfn(offset - SZ_8K);
		altmap->alloc = 0;
423 424 425 426 427 428 429
	} else {
		rc = -ENXIO;
		goto err;
	}

	/* establish pfn range for lookup, and switch to direct map */
	pmem = dev_get_drvdata(dev);
430
	q = pmem->pmem_queue;
D
Dan Williams 已提交
431
	devm_memunmap(dev, (void __force *) pmem->virt_addr);
432
	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
433
			&q->q_usage_counter, altmap);
D
Dan Williams 已提交
434
	pmem->pfn_flags |= PFN_MAP;
435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
	if (IS_ERR(pmem->virt_addr)) {
		rc = PTR_ERR(pmem->virt_addr);
		goto err;
	}

	/* attach pmem disk in "pfn-mode" */
	pmem->data_offset = offset;
	rc = pmem_attach_disk(dev, ndns, pmem);
	if (rc)
		goto err;

	return rc;
 err:
	nvdimm_namespace_detach_pfn(ndns);
	return rc;
450 451
}

452
static int nd_pmem_probe(struct device *dev)
453
{
454
	struct nd_region *nd_region = to_nd_region(dev->parent);
455 456
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
457 458
	struct pmem_device *pmem;

459 460 461
	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);
462

463
	nsio = to_nd_namespace_io(&ndns->dev);
464
	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
465 466 467
	if (IS_ERR(pmem))
		return PTR_ERR(pmem);

468
	pmem->ndns = ndns;
469
	dev_set_drvdata(dev, pmem);
470
	ndns->rw_bytes = pmem_rw_bytes;
471 472 473
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
474

475 476 477 478
	if (is_nd_btt(dev)) {
		/* btt allocates its own request_queue */
		blk_cleanup_queue(pmem->pmem_queue);
		pmem->pmem_queue = NULL;
479
		return nvdimm_namespace_attach_btt(ndns);
480
	}
481

482 483 484
	if (is_nd_pfn(dev))
		return nvdimm_namespace_attach_pfn(ndns);

485 486 487 488 489 490
	if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
		/*
		 * We'll come back as either btt-pmem, or pfn-pmem, so
		 * drop the queue allocation for now.
		 */
		blk_cleanup_queue(pmem->pmem_queue);
491 492 493 494
		return -ENXIO;
	}

	return pmem_attach_disk(dev, ndns, pmem);
495 496
}

497
static int nd_pmem_remove(struct device *dev)
498
{
499
	struct pmem_device *pmem = dev_get_drvdata(dev);
500

501
	if (is_nd_btt(dev))
502 503 504
		nvdimm_namespace_detach_btt(pmem->ndns);
	else if (is_nd_pfn(dev))
		nvdimm_namespace_detach_pfn(pmem->ndns);
505 506 507
	else
		pmem_detach_disk(pmem);

508 509 510
	return 0;
}

511 512
MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
513
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
514 515 516 517 518
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.drv = {
		.name = "nd_pmem",
519
	},
520
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
521 522 523 524 525 526 527 528 529 530
};

/*
 * Module init: obtain a dynamic block major for "pmem" and register the nd
 * driver.  The major is handed back if driver registration fails.
 */
static int __init pmem_init(void)
{
	int error;

	pmem_major = register_blkdev(0, "pmem");
	if (pmem_major < 0)
		return pmem_major;

	error = nd_driver_register(&nd_pmem_driver);
	if (!error)
		return 0;

	unregister_blkdev(pmem_major, "pmem");
	return error;
}
module_init(pmem_init);

static void pmem_exit(void)
{
543
	driver_unregister(&nd_pmem_driver.drv);
544 545 546 547 548 549
	unregister_blkdev(pmem_major, "pmem");
}
module_exit(pmem_exit);

/* Module metadata. */
MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");