// SPDX-License-Identifier: GPL-2.0-only
/*
 * Ram backed block device driver.
 *
 * Copyright (C) 2007 Nick Piggin
 * Copyright (C) 2007 Novell Inc.
 *
 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
 * of their respective owners.
 */

#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>

#include <linux/uaccess.h>

/*
 * Sector <-> page conversion helpers. Shifting a sector number right by
 * PAGE_SECTORS_SHIFT yields the index of the page holding it; PAGE_SECTORS
 * is the number of sectors covered by one page.
 */
#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)

/*
 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
 * the pages containing the block device's contents. A brd page's ->index is
 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
	/* Device index; brd_alloc() names the disk "ram<brd_number>". */
	int		brd_number;

	/* Request queue and gendisk allocated for this device by brd_alloc(). */
	struct request_queue	*brd_queue;
	struct gendisk		*brd_disk;
	/* Link in the global brd_devices list. */
	struct list_head	brd_list;

	/*
	 * Backing store of pages and lock to protect it. This is the contents
	 * of the block device. brd_insert_page() holds brd_lock while adding
	 * a page to brd_pages; brd_lookup_page() reads the tree under
	 * rcu_read_lock() only.
	 */
	spinlock_t		brd_lock;
	struct radix_tree_root	brd_pages;
};

/*
 * Find the backing page that holds @sector.
 *
 * Returns the page stored in brd->brd_pages for that sector, or NULL if no
 * page has been inserted at that position.
 */
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
	/* sector to page index */
	const pgoff_t index = sector >> PAGE_SECTORS_SHIFT;
	struct page *found;

	/*
	 * Having the device node open protects the page lifetime: brd pages
	 * are never deleted under us, so no extra locking or refcounting is
	 * needed for the page itself.
	 *
	 * The same holds for the radix-tree nodes (so rcu_read_lock() is not
	 * strictly required), but the radix-tree API does not document that
	 * guarantee. Updates other than deletes can still run concurrently,
	 * so stay on the safe side and take the RCU read lock anyway.
	 */
	rcu_read_lock();
	found = radix_tree_lookup(&brd->brd_pages, index);
	rcu_read_unlock();

	/* A stored page must carry the index it was inserted under. */
	if (found)
		BUG_ON(found->index != index);

	return found;
}

/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty (zeroed) page, insert it into
 * brd->brd_pages, and return it.
 *
 * Returns NULL if either the page or the radix-tree preload cannot be
 * allocated.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx;
	struct page *page;
	gfp_t gfp_flags;

	page = brd_lookup_page(brd, sector);
	if (page)
		return page;

	/*
	 * Must use NOIO because we don't want to recurse back into the
	 * block or filesystem layers from page reclaim.
	 */
	gfp_flags = GFP_NOIO | __GFP_ZERO | __GFP_HIGHMEM;
	page = alloc_page(gfp_flags);
	if (!page)
		return NULL;

	if (radix_tree_preload(GFP_NOIO)) {
		__free_page(page);
		return NULL;
	}

	spin_lock(&brd->brd_lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	page->index = idx;
	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
		/*
		 * Insertion failed. The code expects this to mean a page is
		 * already present at idx (enforced by the BUG_ONs below), so
		 * drop the new page and hand back the existing one.
		 */
		__free_page(page);
		page = radix_tree_lookup(&brd->brd_pages, idx);
		BUG_ON(!page);
		BUG_ON(page->index != idx);
	}
	spin_unlock(&brd->brd_lock);

	radix_tree_preload_end();

	return page;
}

/*
 * Free all backing store pages and radix tree. This must only be called when
 * there are no other users of the device.
 *
 * Pages are looked up, removed from brd->brd_pages and freed in batches of
 * FREE_BATCH.
 */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
	unsigned long pos = 0;
	struct page *pages[FREE_BATCH];
	int nr_pages;

	do {
		int i;

		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
				(void **)pages, pos, FREE_BATCH);

		for (i = 0; i < nr_pages; i++) {
			void *ret;

			BUG_ON(pages[i]->index < pos);
			pos = pages[i]->index;
			ret = radix_tree_delete(&brd->brd_pages, pos);
			BUG_ON(!ret || ret != pages[i]);
			__free_page(pages[i]);
		}

		/* Start the next lookup just past the last index handled. */
		pos++;

		/*
		 * It takes 3.4 seconds to remove 80GiB ramdisk.
		 * So, we need cond_resched to avoid stalling the CPU.
		 */
		cond_resched();

		/*
		 * This assumes radix_tree_gang_lookup always returns as
		 * many pages as possible. If the radix-tree code changes,
		 * so will this have to.
		 */
	} while (nr_pages == FREE_BATCH);
}

/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 *
 * Makes sure a backing page exists for the page containing @sector and, if
 * the @n bytes starting there run past the end of that page, for the page
 * that follows it as well.
 *
 * Returns 0 on success, or -ENOSPC if a backing page could not be inserted.
 */
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
	/* byte offset of @sector within its page */
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	/* number of bytes that land in the first page */
	copy = min_t(size_t, n, PAGE_SIZE - offset);
	if (!brd_insert_page(brd, sector))
		return -ENOSPC;
	if (copy < n) {
		/* the remainder spills over into the next page */
		sector += copy >> SECTOR_SHIFT;
		if (!brd_insert_page(brd, sector))
			return -ENOSPC;
	}
	return 0;
}

/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 *
 * The backing pages must already exist (see copy_to_brd_setup); a missing
 * page triggers BUG_ON. The copy is split in two when it runs past the end
 * of the first page.
 */
static void copy_to_brd(struct brd_device *brd, const void *src,
			sector_t sector, size_t n)
{
	struct page *page;
	void *dst;
	/* byte offset of @sector within its page */
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	/* first part: up to the end of the page containing @sector */
	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	BUG_ON(!page);

	dst = kmap_atomic(page);
	memcpy(dst + offset, src, copy);
	kunmap_atomic(dst);

	if (copy < n) {
		/* second part: the remainder, from the start of the next page */
		src += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		BUG_ON(!page);

		dst = kmap_atomic(page);
		memcpy(dst, src, copy);
		kunmap_atomic(dst);
	}
}

/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 *
 * A range with no backing page reads back as zeroes. The copy is split in
 * two when it runs past the end of the first page.
 */
static void copy_from_brd(void *dst, struct brd_device *brd,
			sector_t sector, size_t n)
{
	struct page *page;
	void *src;
	/* byte offset of @sector within its page */
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	/* first part: up to the end of the page containing @sector */
	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	if (page) {
		src = kmap_atomic(page);
		memcpy(dst, src + offset, copy);
		kunmap_atomic(src);
	} else
		memset(dst, 0, copy);

	if (copy < n) {
		/* second part: the remainder, from the start of the next page */
		dst += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		if (page) {
			src = kmap_atomic(page);
			memcpy(dst, src, copy);
			kunmap_atomic(src);
		} else
			memset(dst, 0, copy);
	}
}

/*
 * Process a single bvec of a bio.
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
259
			unsigned int len, unsigned int off, unsigned int op,
N
Nick Piggin 已提交
260 261 262 263 264
			sector_t sector)
{
	void *mem;
	int err = 0;

265
	if (op_is_write(op)) {
N
Nick Piggin 已提交
266 267 268 269 270
		err = copy_to_brd_setup(brd, sector, len);
		if (err)
			goto out;
	}

271
	mem = kmap_atomic(page);
272
	if (!op_is_write(op)) {
N
Nick Piggin 已提交
273 274
		copy_from_brd(mem + off, brd, sector, len);
		flush_dcache_page(page);
N
Nick Piggin 已提交
275 276
	} else {
		flush_dcache_page(page);
N
Nick Piggin 已提交
277
		copy_to_brd(brd, mem + off, sector, len);
N
Nick Piggin 已提交
278
	}
279
	kunmap_atomic(mem);
N
Nick Piggin 已提交
280 281 282 283 284

out:
	return err;
}

285
static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
N
Nick Piggin 已提交
286
{
287
	struct brd_device *brd = bio->bi_disk->private_data;
288
	struct bio_vec bvec;
N
Nick Piggin 已提交
289
	sector_t sector;
290
	struct bvec_iter iter;
N
Nick Piggin 已提交
291

292
	sector = bio->bi_iter.bi_sector;
293
	if (bio_end_sector(bio) > get_capacity(bio->bi_disk))
294
		goto io_error;
N
Nick Piggin 已提交
295

296 297
	bio_for_each_segment(bvec, bio, iter) {
		unsigned int len = bvec.bv_len;
298 299
		int err;

M
Ming Lei 已提交
300 301 302 303
		/* Don't support un-aligned buffer */
		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
				(len & (SECTOR_SIZE - 1)));

304
		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
305
				  bio_op(bio), sector);
N
Nick Piggin 已提交
306
		if (err)
307
			goto io_error;
N
Nick Piggin 已提交
308 309 310
		sector += len >> SECTOR_SHIFT;
	}

311
	bio_endio(bio);
312
	return BLK_QC_T_NONE;
313 314
io_error:
	bio_io_error(bio);
315
	return BLK_QC_T_NONE;
N
Nick Piggin 已提交
316 317
}

M
Matthew Wilcox 已提交
318
static int brd_rw_page(struct block_device *bdev, sector_t sector,
319
		       struct page *page, unsigned int op)
M
Matthew Wilcox 已提交
320 321
{
	struct brd_device *brd = bdev->bd_disk->private_data;
322 323 324 325
	int err;

	if (PageTransHuge(page))
		return -ENOTSUPP;
326 327
	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
	page_endio(page, op_is_write(op), err);
M
Matthew Wilcox 已提交
328 329 330
	return err;
}

331
static const struct block_device_operations brd_fops = {
N
Nick Piggin 已提交
332
	.owner =		THIS_MODULE,
M
Matthew Wilcox 已提交
333
	.rw_page =		brd_rw_page,
N
Nick Piggin 已提交
334 335 336 337 338
};

/*
 * And now the modules code and kernel interface.
 */
B
Boaz Harrosh 已提交
339
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
340
module_param(rd_nr, int, 0444);
N
Nick Piggin 已提交
341
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
B
Boaz Harrosh 已提交
342

J
Jan Kara 已提交
343
unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
344
module_param(rd_size, ulong, 0444);
N
Nick Piggin 已提交
345
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
B
Boaz Harrosh 已提交
346 347

static int max_part = 1;
348
module_param(max_part, int, 0444);
B
Boaz Harrosh 已提交
349 350
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");

N
Nick Piggin 已提交
351 352
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
353
MODULE_ALIAS("rd");
N
Nick Piggin 已提交
354 355 356 357 358 359 360 361

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
	rd_size = simple_strtol(str, NULL, 0);
	return 1;
}
362
__setup("ramdisk_size=", ramdisk_size);
N
Nick Piggin 已提交
363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386
#endif

/*
 * The device scheme is derived from loop.c. Keep them in sync where possible
 * (should share code eventually).
 *
 * brd_devices links every brd_device through its brd_list member.
 * brd_devices_mutex is held by brd_probe() while it looks up or creates a
 * device via brd_init_one().
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);

static struct brd_device *brd_alloc(int i)
{
	struct brd_device *brd;
	struct gendisk *disk;

	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
	if (!brd)
		goto out;
	brd->brd_number		= i;
	spin_lock_init(&brd->brd_lock);
	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);

	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
	if (!brd->brd_queue)
		goto out_free_dev;
387

N
Nick Piggin 已提交
388 389
	blk_queue_make_request(brd->brd_queue, brd_make_request);

390 391 392 393 394 395 396
	/* This is so fdisk will align partitions on 4k, because of
	 * direct_access API needing 4k alignment, returning a PFN
	 * (This is only a problem on very small devices <= 4M,
	 *  otherwise fdisk will align on 1M. Regardless this call
	 *  is harmless)
	 */
	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
B
Boaz Harrosh 已提交
397
	disk = brd->brd_disk = alloc_disk(max_part);
N
Nick Piggin 已提交
398 399 400
	if (!disk)
		goto out_free_queue;
	disk->major		= RAMDISK_MAJOR;
B
Boaz Harrosh 已提交
401
	disk->first_minor	= i * max_part;
N
Nick Piggin 已提交
402 403
	disk->fops		= &brd_fops;
	disk->private_data	= brd;
B
Boaz Harrosh 已提交
404
	disk->flags		= GENHD_FL_EXT_DEVT;
N
Nick Piggin 已提交
405 406
	sprintf(disk->disk_name, "ram%d", i);
	set_capacity(disk, rd_size * 2);
407
	brd->brd_queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
N
Nick Piggin 已提交
408

S
SeongJae Park 已提交
409
	/* Tell the block layer that this is not a rotational device */
410 411
	blk_queue_flag_set(QUEUE_FLAG_NONROT, brd->brd_queue);
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, brd->brd_queue);
S
SeongJae Park 已提交
412

N
Nick Piggin 已提交
413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430
	return brd;

out_free_queue:
	blk_cleanup_queue(brd->brd_queue);
out_free_dev:
	kfree(brd);
out:
	return NULL;
}

/*
 * Release everything @brd owns: drop the gendisk reference, clean up the
 * request queue, free all backing pages, then free the brd_device itself.
 */
static void brd_free(struct brd_device *brd)
{
	struct gendisk *disk = brd->brd_disk;
	struct request_queue *queue = brd->brd_queue;

	put_disk(disk);
	blk_cleanup_queue(queue);
	brd_free_pages(brd);
	kfree(brd);
}

B
Boaz Harrosh 已提交
431
static struct brd_device *brd_init_one(int i, bool *new)
N
Nick Piggin 已提交
432 433 434
{
	struct brd_device *brd;

B
Boaz Harrosh 已提交
435
	*new = false;
N
Nick Piggin 已提交
436 437 438 439 440 441 442
	list_for_each_entry(brd, &brd_devices, brd_list) {
		if (brd->brd_number == i)
			goto out;
	}

	brd = brd_alloc(i);
	if (brd) {
443
		brd->brd_disk->queue = brd->brd_queue;
N
Nick Piggin 已提交
444 445 446
		add_disk(brd->brd_disk);
		list_add_tail(&brd->brd_list, &brd_devices);
	}
B
Boaz Harrosh 已提交
447
	*new = true;
N
Nick Piggin 已提交
448 449 450 451 452 453 454 455 456 457 458 459 460 461 462
out:
	return brd;
}

/*
 * Tear down one device: take it off the brd_devices list, remove its
 * gendisk, and release it with brd_free().
 */
static void brd_del_one(struct brd_device *brd)
{
	struct gendisk *disk = brd->brd_disk;

	list_del(&brd->brd_list);
	del_gendisk(disk);
	brd_free(brd);
}

/*
 * Probe callback registered with blk_register_region() in brd_init().
 *
 * Maps @dev to device number MINOR(dev) / max_part, creating that device on
 * demand under brd_devices_mutex. Returns the kobject obtained from
 * get_disk_and_module(), or NULL if no device could be found or created.
 * *@part is reset to 0 when brd_init_one() reports that the creation path
 * was taken.
 */
static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
	struct brd_device *brd;
	struct kobject *kobj;
	bool new;

	mutex_lock(&brd_devices_mutex);
	brd = brd_init_one(MINOR(dev) / max_part, &new);
	kobj = brd ? get_disk_and_module(brd->brd_disk) : NULL;
	mutex_unlock(&brd_devices_mutex);

	if (new)
		*part = 0;

	return kobj;
}

Z
Zhiqiang Liu 已提交
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
static inline void brd_check_and_reset_par(void)
{
	if (unlikely(!max_part))
		max_part = 1;

	/*
	 * make sure 'max_part' can be divided exactly by (1U << MINORBITS),
	 * otherwise, it is possiable to get same dev_t when adding partitions.
	 */
	if ((1U << MINORBITS) % max_part != 0)
		max_part = 1UL << fls(max_part);

	if (max_part > DISK_MAX_PARTS) {
		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
			DISK_MAX_PARTS, DISK_MAX_PARTS);
		max_part = DISK_MAX_PARTS;
	}
}

N
Nick Piggin 已提交
495 496 497
static int __init brd_init(void)
{
	struct brd_device *brd, *next;
B
Boaz Harrosh 已提交
498
	int i;
N
Nick Piggin 已提交
499 500 501 502 503

	/*
	 * brd module now has a feature to instantiate underlying device
	 * structure on-demand, provided that there is an access dev node.
	 *
B
Boaz Harrosh 已提交
504 505 506 507 508 509 510 511 512
	 * (1) if rd_nr is specified, create that many upfront. else
	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT
	 * (2) User can further extend brd devices by create dev node themselves
	 *     and have kernel automatically instantiate actual device
	 *     on-demand. Example:
	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
	 *		fdisk -l /path/devnod_name
	 *	If (X / max_part) was not already created it will be created
	 *	dynamically.
N
Nick Piggin 已提交
513
	 */
514

N
Nick Piggin 已提交
515 516 517
	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
		return -EIO;

Z
Zhiqiang Liu 已提交
518
	brd_check_and_reset_par();
B
Boaz Harrosh 已提交
519 520

	for (i = 0; i < rd_nr; i++) {
N
Nick Piggin 已提交
521 522 523 524 525 526 527 528
		brd = brd_alloc(i);
		if (!brd)
			goto out_free;
		list_add_tail(&brd->brd_list, &brd_devices);
	}

	/* point of no return */

529 530 531 532 533 534
	list_for_each_entry(brd, &brd_devices, brd_list) {
		/*
		 * associate with queue just before adding disk for
		 * avoiding to mess up failure path
		 */
		brd->brd_disk->queue = brd->brd_queue;
N
Nick Piggin 已提交
535
		add_disk(brd->brd_disk);
536
	}
N
Nick Piggin 已提交
537

B
Boaz Harrosh 已提交
538
	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
N
Nick Piggin 已提交
539 540
				  THIS_MODULE, brd_probe, NULL, NULL);

B
Boaz Harrosh 已提交
541
	pr_info("brd: module loaded\n");
N
Nick Piggin 已提交
542 543 544 545 546 547 548
	return 0;

out_free:
	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		list_del(&brd->brd_list);
		brd_free(brd);
	}
549
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
N
Nick Piggin 已提交
550

B
Boaz Harrosh 已提交
551
	pr_info("brd: module NOT loaded !!!\n");
N
Nick Piggin 已提交
552 553 554 555 556 557 558 559 560 561
	return -ENOMEM;
}

/*
 * Module exit: delete every device on brd_devices, then unregister the
 * probe region and the ramdisk major.
 */
static void __exit brd_exit(void)
{
	struct brd_device *brd, *next;

	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
		brd_del_one(brd);

	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

	pr_info("brd: module unloaded\n");
}

/* Register brd_init() and brd_exit() as the module's entry and exit points. */
module_init(brd_init);
module_exit(brd_exit);