/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/badblocks.h>
#include <linux/memremap.h>
#include <linux/vmalloc.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/pmem.h>
#include <linux/nd.h>
#include "pfn.h"
#include "nd.h"

struct pmem_device {
	struct request_queue	*pmem_queue;
	struct gendisk		*pmem_disk;
38
	struct nd_namespace_common *ndns;
39 40 41

	/* One contiguous memory region per device */
	phys_addr_t		phys_addr;
42 43
	/* when non-zero this device is hosting a 'pfn' instance */
	phys_addr_t		data_offset;
D
Dan Williams 已提交
44
	unsigned long		pfn_flags;
45
	void __pmem		*virt_addr;
46
	size_t			size;
47
	struct badblocks	bb;
48 49 50 51
};

/* Block major number obtained from register_blkdev() in pmem_init(). */
static int pmem_major;

/*
 * Report whether the range starting at @sector and spanning @len bytes
 * overlaps an entry in @bb.  @len is converted to 512-byte sectors by
 * integer division before the lookup.
 */
static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
{
	sector_t first_bad;
	int num_bad;

	/* Nothing recorded: skip the lookup entirely. */
	if (!bb->count)
		return false;

	return badblocks_check(bb, sector, len / 512, &first_bad,
			&num_bad) != 0;
}

/*
 * Copy one segment between @page and persistent memory.
 *
 * @pmem:   device whose mapping (virt_addr + data_offset) is the target
 * @page:   page holding the data; mapped with kmap_atomic() for the copy
 * @len:    number of bytes to transfer
 * @off:    byte offset of the data within @page
 * @rw:     READ copies pmem -> page, anything else copies page -> pmem
 * @sector: 512-byte sector of the device where the transfer starts
 *
 * Returns 0 on success, or -EIO when a read touches a known bad block.
 * The atomic mapping is released on every path, including the error one.
 */
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, int rw,
			sector_t sector)
{
	int rc = 0;
	void *mem = kmap_atomic(page);
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;

	if (rw == READ) {
		if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
			/* fall through to kunmap_atomic() instead of leaking it */
			rc = -EIO;
		} else {
			memcpy_from_pmem(mem + off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		flush_dcache_page(page);
		memcpy_to_pmem(pmem_addr, mem + off, len);
	}

	kunmap_atomic(mem);
	return rc;
}

static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
88
{
89
	int rc = 0;
D
Dan Williams 已提交
90 91
	bool do_acct;
	unsigned long start;
92 93
	struct bio_vec bvec;
	struct bvec_iter iter;
D
Dan Williams 已提交
94 95
	struct block_device *bdev = bio->bi_bdev;
	struct pmem_device *pmem = bdev->bd_disk->private_data;
96

D
Dan Williams 已提交
97
	do_acct = nd_iostat_start(bio, &start);
98 99 100 101 102 103 104 105 106
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, bio_data_dir(bio),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
D
Dan Williams 已提交
107 108
	if (do_acct)
		nd_iostat_end(bio, start);
109 110 111 112

	if (bio_data_dir(bio))
		wmb_pmem();

113
	bio_endio(bio);
114
	return BLK_QC_T_NONE;
115 116 117 118 119 120
}

/*
 * ->rw_page handler: transfer one whole page (PAGE_CACHE_SIZE bytes at
 * offset 0) at @sector through pmem_do_bvec().
 *
 * Returns the pmem_do_bvec() result.  The page is completed with
 * page_endio() only when that result is 0.
 */
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
	int rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);

	if (rw & WRITE)
		wmb_pmem();

	/*
	 * The ->rw_page interface is subtle and tricky.  The core
	 * retries on any error, so page_endio() may only be invoked in
	 * the successful completion case.  Otherwise, we'll see crashes
	 * caused by double completion.
	 */
	if (rc)
		return rc;

	page_endio(page, rw & WRITE, 0);
	return 0;
}

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
D
Dan Williams 已提交
140
		      void __pmem **kaddr, pfn_t *pfn)
141 142
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
143
	resource_size_t offset = sector * 512 + pmem->data_offset;
144

145
	*kaddr = pmem->virt_addr + offset;
D
Dan Williams 已提交
146
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
147

148
	return pmem->size - offset;
149 150 151 152 153 154
}

static const struct block_device_operations pmem_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		pmem_rw_page,
	.direct_access =	pmem_direct_access,
155
	.revalidate_disk =	nvdimm_revalidate_disk,
156 157
};

158 159
static struct pmem_device *pmem_alloc(struct device *dev,
		struct resource *res, int id)
160 161
{
	struct pmem_device *pmem;
162
	struct request_queue *q;
163

164
	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
165
	if (!pmem)
166
		return ERR_PTR(-ENOMEM);
167 168 169

	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
170
	if (!arch_has_wmb_pmem())
171
		dev_warn(dev, "unable to guarantee persistence of writes\n");
172

173 174
	if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
			dev_name(dev))) {
175 176
		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
				&pmem->phys_addr, pmem->size);
177
		return ERR_PTR(-EBUSY);
178 179
	}

180 181 182 183
	q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev));
	if (!q)
		return ERR_PTR(-ENOMEM);

D
Dan Williams 已提交
184 185
	pmem->pfn_flags = PFN_DEV;
	if (pmem_should_map_pages(dev)) {
186 187
		pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
				NULL);
D
Dan Williams 已提交
188 189
		pmem->pfn_flags |= PFN_MAP;
	} else
D
Dan Williams 已提交
190 191 192
		pmem->virt_addr = (void __pmem *) devm_memremap(dev,
				pmem->phys_addr, pmem->size,
				ARCH_MEMREMAP_PMEM);
193

194 195
	if (IS_ERR(pmem->virt_addr)) {
		blk_cleanup_queue(q);
196
		return (void __force *) pmem->virt_addr;
197
	}
198

199
	pmem->pmem_queue = q;
200 201 202 203 204
	return pmem;
}

/*
 * Undo pmem_attach_disk(): remove and release the gendisk, then clean
 * up the request queue.  Does nothing when no disk was ever attached.
 */
static void pmem_detach_disk(struct pmem_device *pmem)
{
	struct gendisk *disk = pmem->pmem_disk;

	if (disk) {
		del_gendisk(disk);
		put_disk(disk);
		blk_cleanup_queue(pmem->pmem_queue);
	}
}

static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns, struct pmem_device *pmem)
215
{
216
	int nid = dev_to_node(dev);
217
	struct gendisk *disk;
218 219

	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
220
	blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
221
	blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
222
	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
223
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
224

225
	disk = alloc_disk_node(0, nid);
226 227 228 229
	if (!disk) {
		blk_cleanup_queue(pmem->pmem_queue);
		return -ENOMEM;
	}
230 231

	disk->major		= pmem_major;
232
	disk->first_minor	= 0;
233 234 235 236
	disk->fops		= &pmem_fops;
	disk->private_data	= pmem;
	disk->queue		= pmem->pmem_queue;
	disk->flags		= GENHD_FL_EXT_DEVT;
V
Vishal Verma 已提交
237
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
238 239
	disk->driverfs_dev = dev;
	set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
240
	pmem->pmem_disk = disk;
241
	devm_exit_badblocks(dev, &pmem->bb);
242 243 244
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
245

246
	disk->bb = &pmem->bb;
247
	add_disk(disk);
248
	revalidate_disk(disk);
249

250 251
	return 0;
}
252

253 254 255 256 257 258 259 260 261 262
/*
 * Byte-granular accessor installed as ndns->rw_bytes by nd_pmem_probe().
 *
 * @ndns:   namespace; its claim device carries the pmem_device
 * @offset: byte offset into the mapping
 * @buf:    caller buffer to read into / write from
 * @size:   number of bytes
 * @rw:     READ to copy from pmem, anything else to copy to pmem
 *
 * Returns 0 on success, -EFAULT when offset + size exceeds the device
 * size, or -EIO when a read overlaps a known bad block.  Writes are
 * followed by wmb_pmem().
 */
static int pmem_rw_bytes(struct nd_namespace_common *ndns,
		resource_size_t offset, void *buf, size_t size, int rw)
{
	struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
	unsigned int sz_align;

	if (unlikely(offset + size > pmem->size)) {
		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
		return -EFAULT;
	}

	if (rw != READ) {
		memcpy_to_pmem(pmem->virt_addr + offset, buf, size);
		wmb_pmem();
		return 0;
	}

	/* widen the range to whole 512-byte sectors for the bad-block check */
	sz_align = ALIGN(size + (offset & (512 - 1)), 512);
	if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
		return -EIO;

	memcpy_from_pmem(buf, pmem->virt_addr + offset, size);
	return 0;
}

static int nd_pfn_init(struct nd_pfn *nd_pfn)
{
	struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
	struct nd_namespace_common *ndns = nd_pfn->ndns;
	struct nd_region *nd_region;
	unsigned long npfns;
	phys_addr_t offset;
	u64 checksum;
	int rc;

	if (!pfn_sb)
		return -ENOMEM;

	nd_pfn->pfn_sb = pfn_sb;
	rc = nd_pfn_validate(nd_pfn);
293 294 295
	if (rc == -ENODEV)
		/* no info block, do init */;
	else
296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
		return rc;

	nd_region = to_nd_region(nd_pfn->dev.parent);
	if (nd_region->ro) {
		dev_info(&nd_pfn->dev,
				"%s is read-only, unable to init metadata\n",
				dev_name(&nd_region->dev));
		goto err;
	}

	memset(pfn_sb, 0, sizeof(*pfn_sb));
	npfns = (pmem->size - SZ_8K) / SZ_4K;
	/*
	 * Note, we use 64 here for the standard size of struct page,
	 * debugging options may cause it to be larger in which case the
	 * implementation will limit the pfns advertised through
	 * ->direct_access() to those that are included in the memmap.
	 */
	if (nd_pfn->mode == PFN_MODE_PMEM)
315
		offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
316
	else if (nd_pfn->mode == PFN_MODE_RAM)
317
		offset = ALIGN(SZ_8K, nd_pfn->align);
318 319 320 321 322 323 324 325 326
	else
		goto err;

	npfns = (pmem->size - offset) / SZ_4K;
	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
	pfn_sb->dataoff = cpu_to_le64(offset);
	pfn_sb->npfns = cpu_to_le64(npfns);
	memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
327
	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
	pfn_sb->version_major = cpu_to_le16(1);
	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
	pfn_sb->checksum = cpu_to_le64(checksum);

	rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
	if (rc)
		goto err;

	return 0;
 err:
	nd_pfn->pfn_sb = NULL;
	kfree(pfn_sb);
	return -ENXIO;
}

/*
 * Tear down a pfn-mode attachment: detach the pmem disk belonging to
 * the claiming nd_pfn device, then free its info block buffer.
 * Always returns 0.
 */
static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
{
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);

	/* free pmem disk */
	pmem_detach_disk(dev_get_drvdata(&nd_pfn->dev));

	/* release nd_pfn resources */
	kfree(nd_pfn->pfn_sb);
	nd_pfn->pfn_sb = NULL;

	return 0;
}

static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
360
{
361 362 363 364
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
	struct device *dev = &nd_pfn->dev;
	struct nd_region *nd_region;
365
	struct vmem_altmap *altmap;
366 367 368 369
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
	phys_addr_t offset;
	int rc;
370 371 372 373
	struct vmem_altmap __altmap = {
		.base_pfn = __phys_to_pfn(nsio->res.start),
		.reserve = __phys_to_pfn(SZ_8K),
	};
374 375 376 377 378 379 380 381 382 383 384 385 386

	if (!nd_pfn->uuid || !nd_pfn->ndns)
		return -ENODEV;

	nd_region = to_nd_region(dev->parent);
	rc = nd_pfn_init(nd_pfn);
	if (rc)
		return rc;

	pfn_sb = nd_pfn->pfn_sb;
	offset = le64_to_cpu(pfn_sb->dataoff);
	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
	if (nd_pfn->mode == PFN_MODE_RAM) {
387
		if (offset < SZ_8K)
388 389 390
			return -EINVAL;
		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
		altmap = NULL;
391 392 393 394 395 396 397 398 399 400 401
	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
		nd_pfn->npfns = (resource_size(&nsio->res) - offset)
			/ PAGE_SIZE;
		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
			dev_info(&nd_pfn->dev,
					"number of pfns truncated from %lld to %ld\n",
					le64_to_cpu(nd_pfn->pfn_sb->npfns),
					nd_pfn->npfns);
		altmap = & __altmap;
		altmap->free = __phys_to_pfn(offset - SZ_8K);
		altmap->alloc = 0;
402 403 404 405 406 407 408
	} else {
		rc = -ENXIO;
		goto err;
	}

	/* establish pfn range for lookup, and switch to direct map */
	pmem = dev_get_drvdata(dev);
D
Dan Williams 已提交
409
	devm_memunmap(dev, (void __force *) pmem->virt_addr);
410
	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
411
			altmap);
D
Dan Williams 已提交
412
	pmem->pfn_flags |= PFN_MAP;
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427
	if (IS_ERR(pmem->virt_addr)) {
		rc = PTR_ERR(pmem->virt_addr);
		goto err;
	}

	/* attach pmem disk in "pfn-mode" */
	pmem->data_offset = offset;
	rc = pmem_attach_disk(dev, ndns, pmem);
	if (rc)
		goto err;

	return rc;
 err:
	nvdimm_namespace_detach_pfn(ndns);
	return rc;
428 429
}

430
static int nd_pmem_probe(struct device *dev)
431
{
432
	struct nd_region *nd_region = to_nd_region(dev->parent);
433 434
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
435 436
	struct pmem_device *pmem;

437 438 439
	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);
440

441
	nsio = to_nd_namespace_io(&ndns->dev);
442
	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
443 444 445
	if (IS_ERR(pmem))
		return PTR_ERR(pmem);

446
	pmem->ndns = ndns;
447
	dev_set_drvdata(dev, pmem);
448
	ndns->rw_bytes = pmem_rw_bytes;
449 450 451
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
452

453 454 455 456
	if (is_nd_btt(dev)) {
		/* btt allocates its own request_queue */
		blk_cleanup_queue(pmem->pmem_queue);
		pmem->pmem_queue = NULL;
457
		return nvdimm_namespace_attach_btt(ndns);
458
	}
459

460 461 462
	if (is_nd_pfn(dev))
		return nvdimm_namespace_attach_pfn(ndns);

463 464 465 466 467 468
	if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) {
		/*
		 * We'll come back as either btt-pmem, or pfn-pmem, so
		 * drop the queue allocation for now.
		 */
		blk_cleanup_queue(pmem->pmem_queue);
469 470 471 472
		return -ENXIO;
	}

	return pmem_attach_disk(dev, ndns, pmem);
473 474
}

475
static int nd_pmem_remove(struct device *dev)
476
{
477
	struct pmem_device *pmem = dev_get_drvdata(dev);
478

479
	if (is_nd_btt(dev))
480 481 482
		nvdimm_namespace_detach_btt(pmem->ndns);
	else if (is_nd_pfn(dev))
		nvdimm_namespace_detach_pfn(pmem->ndns);
483 484 485
	else
		pmem_detach_disk(pmem);

486 487 488
	return 0;
}

489 490
MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
491
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
492 493 494 495 496
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.drv = {
		.name = "nd_pmem",
497
	},
498
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
499 500 501 502 503 504 505 506 507 508
};

/*
 * Module init: obtain a dynamic block major named "pmem", then register
 * the nd driver.  If driver registration fails the major is released
 * again.  Returns 0 on success or the negative error from either step.
 */
static int __init pmem_init(void)
{
	int rc;

	pmem_major = register_blkdev(0, "pmem");
	if (pmem_major < 0)
		return pmem_major;

	rc = nd_driver_register(&nd_pmem_driver);
	if (rc)
		unregister_blkdev(pmem_major, "pmem");

	return rc;
}
module_init(pmem_init);

static void pmem_exit(void)
{
521
	driver_unregister(&nd_pmem_driver.drv);
522 523 524 525 526 527
	unregister_blkdev(pmem_major, "pmem");
}
module_exit(pmem_exit);

MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");