pmem.c 12.9 KB
Newer Older
1 2 3
/*
 * Persistent Memory Driver
 *
 * Copyright (c) 2014-2015, Intel Corporation.
 * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
 * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#include <asm/cacheflush.h>
#include <linux/blkdev.h>
#include <linux/hdreg.h>
#include <linux/init.h>
#include <linux/platform_device.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
25
#include <linux/badblocks.h>
D
Dan Williams 已提交
26
#include <linux/memremap.h>
27
#include <linux/vmalloc.h>
D
Dan Williams 已提交
28
#include <linux/pfn_t.h>
29
#include <linux/slab.h>
30
#include <linux/pmem.h>
31
#include <linux/nd.h>
32
#include "pfn.h"
33
#include "nd.h"
34 35 36 37

struct pmem_device {
	struct request_queue	*pmem_queue;
	struct gendisk		*pmem_disk;
38
	struct nd_namespace_common *ndns;
39 40 41

	/* One contiguous memory region per device */
	phys_addr_t		phys_addr;
42 43
	/* when non-zero this device is hosting a 'pfn' instance */
	phys_addr_t		data_offset;
D
Dan Williams 已提交
44
	unsigned long		pfn_flags;
45
	void __pmem		*virt_addr;
46
	size_t			size;
47
	struct badblocks	bb;
48 49 50 51
};

/* Block major number obtained from register_blkdev() in pmem_init(). */
static int pmem_major;

52 53 54 55 56 57 58 59 60 61 62 63 64 65
/*
 * Report whether the @len bytes starting at @sector overlap an entry in
 * the bad block list @bb.  An empty list is answered without calling
 * badblocks_check().
 */
static bool is_bad_pmem(struct badblocks *bb, sector_t sector, unsigned int len)
{
	sector_t first_bad;
	int num_bad;

	if (!bb->count)
		return false;

	/* Any non-zero result from badblocks_check() counts as "bad". */
	return badblocks_check(bb, sector, len / 512, &first_bad,
			&num_bad) != 0;
}

/*
 * Copy @len bytes between @page (starting at byte @off within the page)
 * and persistent memory at @sector, relative to the device's data offset.
 *
 * @rw selects the direction: READ copies from pmem into the page, anything
 * else copies from the page into pmem.
 *
 * Returns 0 on success, or -EIO when a read overlaps a known bad block; in
 * that case nothing is copied.
 */
static int pmem_do_bvec(struct pmem_device *pmem, struct page *page,
			unsigned int len, unsigned int off, int rw,
			sector_t sector)
{
	int rc = 0;
	void *mem = kmap_atomic(page);
	phys_addr_t pmem_off = sector * 512 + pmem->data_offset;
	void __pmem *pmem_addr = pmem->virt_addr + pmem_off;

	if (rw == READ) {
		if (unlikely(is_bad_pmem(&pmem->bb, sector, len))) {
			rc = -EIO;
		} else {
			memcpy_from_pmem(mem + off, pmem_addr, len);
			flush_dcache_page(page);
		}
	} else {
		flush_dcache_page(page);
		memcpy_to_pmem(pmem_addr, mem + off, len);
	}

	/*
	 * The atomic mapping must be dropped on every path, including the
	 * bad block error above; returning early would leak it.
	 */
	kunmap_atomic(mem);
	return rc;
}

87
static blk_qc_t pmem_make_request(struct request_queue *q, struct bio *bio)
88
{
89
	int rc = 0;
D
Dan Williams 已提交
90 91
	bool do_acct;
	unsigned long start;
92 93
	struct bio_vec bvec;
	struct bvec_iter iter;
D
Dan Williams 已提交
94 95
	struct block_device *bdev = bio->bi_bdev;
	struct pmem_device *pmem = bdev->bd_disk->private_data;
96

D
Dan Williams 已提交
97
	do_acct = nd_iostat_start(bio, &start);
98 99 100 101 102 103 104 105 106
	bio_for_each_segment(bvec, bio, iter) {
		rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len,
				bvec.bv_offset, bio_data_dir(bio),
				iter.bi_sector);
		if (rc) {
			bio->bi_error = rc;
			break;
		}
	}
D
Dan Williams 已提交
107 108
	if (do_acct)
		nd_iostat_end(bio, start);
109 110 111 112

	if (bio_data_dir(bio))
		wmb_pmem();

113
	bio_endio(bio);
114
	return BLK_QC_T_NONE;
115 116 117 118 119 120
}

/*
 * ->rw_page handler: transfer one PAGE_CACHE_SIZE page at @sector.
 *
 * Returns the result of pmem_do_bvec().  The page is completed with
 * page_endio() only when that result is 0.
 */
static int pmem_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
	int rc = pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);

	if (rw & WRITE)
		wmb_pmem();

	/*
	 * The ->rw_page interface is subtle and tricky.  The core retries
	 * on any error, so completing the page on failure as well would
	 * lead to a double completion and crashes.  Hand the error back
	 * untouched instead.
	 */
	if (rc)
		return rc;

	page_endio(page, rw & WRITE, 0);
	return 0;
}

static long pmem_direct_access(struct block_device *bdev, sector_t sector,
D
Dan Williams 已提交
140
		      void __pmem **kaddr, pfn_t *pfn)
141 142
{
	struct pmem_device *pmem = bdev->bd_disk->private_data;
143
	resource_size_t offset = sector * 512 + pmem->data_offset;
144

145
	*kaddr = pmem->virt_addr + offset;
D
Dan Williams 已提交
146
	*pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags);
147

148
	return pmem->size - offset;
149 150 151 152 153 154
}

static const struct block_device_operations pmem_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		pmem_rw_page,
	.direct_access =	pmem_direct_access,
155
	.revalidate_disk =	nvdimm_revalidate_disk,
156 157
};

158 159
static struct pmem_device *pmem_alloc(struct device *dev,
		struct resource *res, int id)
160 161 162
{
	struct pmem_device *pmem;

163
	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
164
	if (!pmem)
165
		return ERR_PTR(-ENOMEM);
166 167 168

	pmem->phys_addr = res->start;
	pmem->size = resource_size(res);
169
	if (!arch_has_wmb_pmem())
170
		dev_warn(dev, "unable to guarantee persistence of writes\n");
171

172 173
	if (!devm_request_mem_region(dev, pmem->phys_addr, pmem->size,
			dev_name(dev))) {
174 175
		dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n",
				&pmem->phys_addr, pmem->size);
176
		return ERR_PTR(-EBUSY);
177 178
	}

D
Dan Williams 已提交
179 180
	pmem->pfn_flags = PFN_DEV;
	if (pmem_should_map_pages(dev)) {
181 182
		pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res,
				NULL);
D
Dan Williams 已提交
183 184
		pmem->pfn_flags |= PFN_MAP;
	} else
D
Dan Williams 已提交
185 186 187
		pmem->virt_addr = (void __pmem *) devm_memremap(dev,
				pmem->phys_addr, pmem->size,
				ARCH_MEMREMAP_PMEM);
188 189 190

	if (IS_ERR(pmem->virt_addr))
		return (void __force *) pmem->virt_addr;
191 192 193 194 195 196

	return pmem;
}

/*
 * Tear down the gendisk and request queue created by pmem_attach_disk().
 * Does nothing when no disk has been attached.
 */
static void pmem_detach_disk(struct pmem_device *pmem)
{
	struct gendisk *disk = pmem->pmem_disk;

	if (disk) {
		del_gendisk(disk);
		put_disk(disk);
		blk_cleanup_queue(pmem->pmem_queue);
	}
}

205 206
static int pmem_attach_disk(struct device *dev,
		struct nd_namespace_common *ndns, struct pmem_device *pmem)
207
{
208
	int nid = dev_to_node(dev);
209
	struct gendisk *disk;
210

211
	pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid);
212
	if (!pmem->pmem_queue)
213
		return -ENOMEM;
214 215

	blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
216
	blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE);
217
	blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX);
218
	blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
219
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, pmem->pmem_queue);
220

221
	disk = alloc_disk_node(0, nid);
222 223 224 225
	if (!disk) {
		blk_cleanup_queue(pmem->pmem_queue);
		return -ENOMEM;
	}
226 227

	disk->major		= pmem_major;
228
	disk->first_minor	= 0;
229 230 231 232
	disk->fops		= &pmem_fops;
	disk->private_data	= pmem;
	disk->queue		= pmem->pmem_queue;
	disk->flags		= GENHD_FL_EXT_DEVT;
V
Vishal Verma 已提交
233
	nvdimm_namespace_disk_name(ndns, disk->disk_name);
234 235
	disk->driverfs_dev = dev;
	set_capacity(disk, (pmem->size - pmem->data_offset) / 512);
236
	pmem->pmem_disk = disk;
237
	devm_exit_badblocks(dev, &pmem->bb);
238 239 240
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, pmem->data_offset);
241

242
	disk->bb = &pmem->bb;
243
	add_disk(disk);
244
	revalidate_disk(disk);
245

246 247
	return 0;
}
248

249 250 251 252 253 254 255 256 257 258
/*
 * ->rw_bytes handler installed on the namespace by nd_pmem_probe(): copy
 * @size bytes between @buf and the mapped region at byte @offset.
 *
 * Returns 0 on success, -EFAULT when the request runs past the end of the
 * region, or -EIO when a read overlaps a known bad block.
 */
static int pmem_rw_bytes(struct nd_namespace_common *ndns,
		resource_size_t offset, void *buf, size_t size, int rw)
{
	struct pmem_device *pmem = dev_get_drvdata(ndns->claim);
	void __pmem *pmem_addr;
	unsigned int sz_align;

	if (unlikely(offset + size > pmem->size)) {
		dev_WARN_ONCE(&ndns->dev, 1, "request out of range\n");
		return -EFAULT;
	}

	pmem_addr = pmem->virt_addr + offset;

	if (rw != READ) {
		memcpy_to_pmem(pmem_addr, buf, size);
		wmb_pmem();
		return 0;
	}

	/*
	 * The bad block list is kept in 512-byte sectors, so widen the byte
	 * range to whole sectors before checking it.
	 */
	sz_align = ALIGN(size + (offset & (512 - 1)), 512);
	if (unlikely(is_bad_pmem(&pmem->bb, offset / 512, sz_align)))
		return -EIO;

	memcpy_from_pmem(buf, pmem_addr, size);
	return 0;
}

273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
static int nd_pfn_init(struct nd_pfn *nd_pfn)
{
	struct nd_pfn_sb *pfn_sb = kzalloc(sizeof(*pfn_sb), GFP_KERNEL);
	struct pmem_device *pmem = dev_get_drvdata(&nd_pfn->dev);
	struct nd_namespace_common *ndns = nd_pfn->ndns;
	struct nd_region *nd_region;
	unsigned long npfns;
	phys_addr_t offset;
	u64 checksum;
	int rc;

	if (!pfn_sb)
		return -ENOMEM;

	nd_pfn->pfn_sb = pfn_sb;
	rc = nd_pfn_validate(nd_pfn);
289 290 291
	if (rc == -ENODEV)
		/* no info block, do init */;
	else
292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
		return rc;

	nd_region = to_nd_region(nd_pfn->dev.parent);
	if (nd_region->ro) {
		dev_info(&nd_pfn->dev,
				"%s is read-only, unable to init metadata\n",
				dev_name(&nd_region->dev));
		goto err;
	}

	memset(pfn_sb, 0, sizeof(*pfn_sb));
	npfns = (pmem->size - SZ_8K) / SZ_4K;
	/*
	 * Note, we use 64 here for the standard size of struct page,
	 * debugging options may cause it to be larger in which case the
	 * implementation will limit the pfns advertised through
	 * ->direct_access() to those that are included in the memmap.
	 */
	if (nd_pfn->mode == PFN_MODE_PMEM)
311
		offset = ALIGN(SZ_8K + 64 * npfns, nd_pfn->align);
312
	else if (nd_pfn->mode == PFN_MODE_RAM)
313
		offset = ALIGN(SZ_8K, nd_pfn->align);
314 315 316 317 318 319 320 321 322
	else
		goto err;

	npfns = (pmem->size - offset) / SZ_4K;
	pfn_sb->mode = cpu_to_le32(nd_pfn->mode);
	pfn_sb->dataoff = cpu_to_le64(offset);
	pfn_sb->npfns = cpu_to_le64(npfns);
	memcpy(pfn_sb->signature, PFN_SIG, PFN_SIG_LEN);
	memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
323
	memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355
	pfn_sb->version_major = cpu_to_le16(1);
	checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
	pfn_sb->checksum = cpu_to_le64(checksum);

	rc = nvdimm_write_bytes(ndns, SZ_4K, pfn_sb, sizeof(*pfn_sb));
	if (rc)
		goto err;

	return 0;
 err:
	nd_pfn->pfn_sb = NULL;
	kfree(pfn_sb);
	return -ENXIO;
}

/*
 * Undo nvdimm_namespace_attach_pfn(): remove the pmem disk belonging to
 * the pfn device that claims @ndns and free its info block.
 *
 * Always returns 0.
 */
static int nvdimm_namespace_detach_pfn(struct nd_namespace_common *ndns)
{
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);

	/* The disk goes first; this is a no-op if none was attached. */
	pmem_detach_disk(dev_get_drvdata(&nd_pfn->dev));

	/* Then drop the info block buffer held by the pfn device. */
	kfree(nd_pfn->pfn_sb);
	nd_pfn->pfn_sb = NULL;

	return 0;
}

static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns)
356
{
357 358 359 360
	struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
	struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim);
	struct device *dev = &nd_pfn->dev;
	struct nd_region *nd_region;
361
	struct vmem_altmap *altmap;
362 363 364 365
	struct nd_pfn_sb *pfn_sb;
	struct pmem_device *pmem;
	phys_addr_t offset;
	int rc;
366 367 368 369
	struct vmem_altmap __altmap = {
		.base_pfn = __phys_to_pfn(nsio->res.start),
		.reserve = __phys_to_pfn(SZ_8K),
	};
370 371 372 373 374 375 376 377 378 379 380 381 382

	if (!nd_pfn->uuid || !nd_pfn->ndns)
		return -ENODEV;

	nd_region = to_nd_region(dev->parent);
	rc = nd_pfn_init(nd_pfn);
	if (rc)
		return rc;

	pfn_sb = nd_pfn->pfn_sb;
	offset = le64_to_cpu(pfn_sb->dataoff);
	nd_pfn->mode = le32_to_cpu(nd_pfn->pfn_sb->mode);
	if (nd_pfn->mode == PFN_MODE_RAM) {
383
		if (offset < SZ_8K)
384 385 386
			return -EINVAL;
		nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns);
		altmap = NULL;
387 388 389 390 391 392 393 394 395 396 397
	} else if (nd_pfn->mode == PFN_MODE_PMEM) {
		nd_pfn->npfns = (resource_size(&nsio->res) - offset)
			/ PAGE_SIZE;
		if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns)
			dev_info(&nd_pfn->dev,
					"number of pfns truncated from %lld to %ld\n",
					le64_to_cpu(nd_pfn->pfn_sb->npfns),
					nd_pfn->npfns);
		altmap = & __altmap;
		altmap->free = __phys_to_pfn(offset - SZ_8K);
		altmap->alloc = 0;
398 399 400 401 402 403 404
	} else {
		rc = -ENXIO;
		goto err;
	}

	/* establish pfn range for lookup, and switch to direct map */
	pmem = dev_get_drvdata(dev);
D
Dan Williams 已提交
405
	devm_memunmap(dev, (void __force *) pmem->virt_addr);
406
	pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res,
407
			altmap);
D
Dan Williams 已提交
408
	pmem->pfn_flags |= PFN_MAP;
409 410 411 412 413 414 415 416 417 418 419 420 421 422 423
	if (IS_ERR(pmem->virt_addr)) {
		rc = PTR_ERR(pmem->virt_addr);
		goto err;
	}

	/* attach pmem disk in "pfn-mode" */
	pmem->data_offset = offset;
	rc = pmem_attach_disk(dev, ndns, pmem);
	if (rc)
		goto err;

	return rc;
 err:
	nvdimm_namespace_detach_pfn(ndns);
	return rc;
424 425
}

426
static int nd_pmem_probe(struct device *dev)
427
{
428
	struct nd_region *nd_region = to_nd_region(dev->parent);
429 430
	struct nd_namespace_common *ndns;
	struct nd_namespace_io *nsio;
431 432
	struct pmem_device *pmem;

433 434 435
	ndns = nvdimm_namespace_common_probe(dev);
	if (IS_ERR(ndns))
		return PTR_ERR(ndns);
436

437
	nsio = to_nd_namespace_io(&ndns->dev);
438
	pmem = pmem_alloc(dev, &nsio->res, nd_region->id);
439 440 441
	if (IS_ERR(pmem))
		return PTR_ERR(pmem);

442
	pmem->ndns = ndns;
443
	dev_set_drvdata(dev, pmem);
444
	ndns->rw_bytes = pmem_rw_bytes;
445 446 447
	if (devm_init_badblocks(dev, &pmem->bb))
		return -ENOMEM;
	nvdimm_namespace_add_poison(ndns, &pmem->bb, 0);
448

449
	if (is_nd_btt(dev))
450 451
		return nvdimm_namespace_attach_btt(ndns);

452 453 454 455
	if (is_nd_pfn(dev))
		return nvdimm_namespace_attach_pfn(ndns);

	if (nd_btt_probe(ndns, pmem) == 0) {
456
		/* we'll come back as btt-pmem */
457
		return -ENXIO;
458 459 460 461 462 463 464 465
	}

	if (nd_pfn_probe(ndns, pmem) == 0) {
		/* we'll come back as pfn-pmem */
		return -ENXIO;
	}

	return pmem_attach_disk(dev, ndns, pmem);
466 467
}

468
static int nd_pmem_remove(struct device *dev)
469
{
470
	struct pmem_device *pmem = dev_get_drvdata(dev);
471

472
	if (is_nd_btt(dev))
473 474 475
		nvdimm_namespace_detach_btt(pmem->ndns);
	else if (is_nd_pfn(dev))
		nvdimm_namespace_detach_pfn(pmem->ndns);
476 477 478
	else
		pmem_detach_disk(pmem);

479 480 481
	return 0;
}

482 483
/*
 * Module aliases: the plain "pmem" name, plus one alias per namespace
 * device type (IO and PMEM).
 */
MODULE_ALIAS("pmem");
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_IO);
MODULE_ALIAS_ND_DEVICE(ND_DEVICE_NAMESPACE_PMEM);
485 486 487 488 489
static struct nd_device_driver nd_pmem_driver = {
	.probe = nd_pmem_probe,
	.remove = nd_pmem_remove,
	.drv = {
		.name = "nd_pmem",
490
	},
491
	.type = ND_DRIVER_NAMESPACE_IO | ND_DRIVER_NAMESPACE_PMEM,
492 493 494 495 496 497 498 499 500 501
};

/*
 * Module init: obtain a dynamic block major named "pmem", then register
 * the nd driver.  The major is released again if driver registration
 * fails.
 *
 * Returns 0 on success or the error from whichever step failed.
 */
static int __init pmem_init(void)
{
	int error;

	pmem_major = register_blkdev(0, "pmem");
	if (pmem_major < 0)
		return pmem_major;

	error = nd_driver_register(&nd_pmem_driver);
	if (!error)
		return 0;

	unregister_blkdev(pmem_major, "pmem");
	return error;
}
module_init(pmem_init);

static void pmem_exit(void)
{
514
	driver_unregister(&nd_pmem_driver.drv);
515 516 517 518 519 520
	unregister_blkdev(pmem_major, "pmem");
}
module_exit(pmem_exit);

/* Module metadata. */
MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
MODULE_LICENSE("GPL v2");