/*
 * Compressed RAM block device
 *
 * Copyright (C) 2008, 2009, 2010  Nitin Gupta
 *               2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the licence that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 *
 */

#define KMSG_COMPONENT "zram"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#ifdef CONFIG_ZRAM_DEBUG
#define DEBUG
#endif

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/err.h>

#include "zram_drv.h"

/* Globals */
static int zram_major;
static struct zram *zram_devices;
static const char *default_compressor = "lzo";

/* Module params (documentation at end) */
static unsigned int num_devices = 1;

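/*
 * ZRAM_ATTR_RO(name) generates a read-only sysfs show handler that
 * prints the 64-bit counter zram->stats.<name> and wires it up via
 * DEVICE_ATTR_RO(name).
 */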
#define ZRAM_ATTR_RO(name)						\
static ssize_t name##_show(struct device *d,		\
				struct device_attribute *attr, char *b)	\
{									\
	struct zram *zram = dev_to_zram(d);				\
	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
		(u64)atomic64_read(&zram->stats.name));			\
}									\
static DEVICE_ATTR_RO(name);

static inline int init_done(struct zram *zram)
{
	return zram->meta != NULL;
}

static inline struct zram *dev_to_zram(struct device *dev)
{
	return (struct zram *)dev_to_disk(dev)->private_data;
}

static ssize_t disksize_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
}

static ssize_t initstate_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u32 val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = init_done(zram);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
}

static ssize_t orig_data_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);

	return scnprintf(buf, PAGE_SIZE, "%llu\n",
		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
}

static ssize_t mem_used_total_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val = 0;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	if (init_done(zram)) {
		struct zram_meta *meta = zram->meta;
		val = zs_get_total_pages(meta->mem_pool);
	}
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t max_comp_streams_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	int val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = zram->max_comp_streams;
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}

static ssize_t mem_limit_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = zram->limit_pages;
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t mem_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	u64 limit;
	char *tmp;
	struct zram *zram = dev_to_zram(dev);

	limit = memparse(buf, &tmp);
	if (buf == tmp) /* no chars parsed, invalid input */
		return -EINVAL;

	down_write(&zram->init_lock);
	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
	up_write(&zram->init_lock);

	return len;
}

static ssize_t mem_used_max_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val = 0;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	if (init_done(zram))
		val = atomic_long_read(&zram->stats.max_used_pages);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

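/*
 * Writing "0" to mem_used_max resets the high watermark to the pool's
 * current size; any other value is rejected with -EINVAL.
 */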
static ssize_t mem_used_max_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int err;
	unsigned long val;
	struct zram *zram = dev_to_zram(dev);

	err = kstrtoul(buf, 10, &val);
	if (err || val != 0)
		return -EINVAL;

	down_read(&zram->init_lock);
	if (init_done(zram)) {
		struct zram_meta *meta = zram->meta;
		atomic_long_set(&zram->stats.max_used_pages,
				zs_get_total_pages(meta->mem_pool));
	}
	up_read(&zram->init_lock);

	return len;
}

static ssize_t max_comp_streams_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int num;
	struct zram *zram = dev_to_zram(dev);
	int ret;

	ret = kstrtoint(buf, 0, &num);
	if (ret < 0)
		return ret;
	if (num < 1)
		return -EINVAL;

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		if (!zcomp_set_max_streams(zram->comp, num)) {
			pr_info("Cannot change max compression streams\n");
			ret = -EINVAL;
			goto out;
		}
	}

	zram->max_comp_streams = num;
	ret = len;
out:
	up_write(&zram->init_lock);
	return ret;
}

static ssize_t comp_algorithm_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	size_t sz;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	sz = zcomp_available_show(zram->compressor, buf);
	up_read(&zram->init_lock);

	return sz;
}

static ssize_t comp_algorithm_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct zram *zram = dev_to_zram(dev);
	down_write(&zram->init_lock);
	if (init_done(zram)) {
		up_write(&zram->init_lock);
		pr_info("Can't change algorithm for initialized device\n");
		return -EBUSY;
	}
	strlcpy(zram->compressor, buf, sizeof(zram->compressor));
	up_write(&zram->init_lock);
	return len;
}

/* flag operations require holding the table entry's ZRAM_ACCESS bit_spinlock */
static int zram_test_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	return meta->table[index].value & BIT(flag);
}

static void zram_set_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	meta->table[index].value |= BIT(flag);
}

static void zram_clear_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	meta->table[index].value &= ~BIT(flag);
}

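/*
 * table[index].value packs the compressed object size into the low
 * ZRAM_FLAG_SHIFT bits and keeps the zram_pageflags (including the
 * ZRAM_ACCESS lock bit) in the bits above them.
 */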
static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
{
	return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
}

static void zram_set_obj_size(struct zram_meta *meta,
					u32 index, size_t size)
{
	unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;

	meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
}

static inline int is_partial_io(struct bio_vec *bvec)
{
	return bvec->bv_len != PAGE_SIZE;
}

/*
 * Check if request is within bounds and aligned on zram logical blocks.
 */
static inline int valid_io_request(struct zram *zram,
		sector_t start, unsigned int size)
{
	u64 end, bound;

	/* unaligned request */
	if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
		return 0;
	if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
		return 0;

	end = start + (size >> SECTOR_SHIFT);
	bound = zram->disksize >> SECTOR_SHIFT;
	/* out of range */
	if (unlikely(start >= bound || end > bound || start > end))
		return 0;

	/* I/O request is valid */
	return 1;
}

static void zram_meta_free(struct zram_meta *meta, u64 disksize)
{
	size_t num_pages = disksize >> PAGE_SHIFT;
	size_t index;

	/* Free all pages that are still in this zram device */
	for (index = 0; index < num_pages; index++) {
		unsigned long handle = meta->table[index].handle;

		if (!handle)
			continue;

		zs_free(meta->mem_pool, handle);
	}

	zs_destroy_pool(meta->mem_pool);
	vfree(meta->table);
	kfree(meta);
}

static struct zram_meta *zram_meta_alloc(u64 disksize)
{
	size_t num_pages;
	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);

	if (!meta)
		return NULL;

	num_pages = disksize >> PAGE_SHIFT;
	meta->table = vzalloc(num_pages * sizeof(*meta->table));
	if (!meta->table) {
		pr_err("Error allocating zram address table\n");
		goto out_error;
	}

	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
	if (!meta->mem_pool) {
		pr_err("Error creating memory pool\n");
		goto out_error;
	}

	return meta;

out_error:
	vfree(meta->table);
	kfree(meta);
	return NULL;
}

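/*
 * Advance the page index/offset pair by one bio_vec; the offset wraps
 * at PAGE_SIZE and carries into the index.
 */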
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
{
	if (*offset + bvec->bv_len >= PAGE_SIZE)
		(*index)++;
	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
}

static int page_zero_filled(void *ptr)
{
	unsigned int pos;
	unsigned long *page;

	page = (unsigned long *)ptr;

	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
		if (page[pos])
			return 0;
	}

	return 1;
}

static void handle_zero_page(struct bio_vec *bvec)
{
	struct page *page = bvec->bv_page;
	void *user_mem;

	user_mem = kmap_atomic(page);
	if (is_partial_io(bvec))
		memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
	else
		clear_page(user_mem);
	kunmap_atomic(user_mem);

	flush_dcache_page(page);
}

/*
 * To protect concurrent access to the same index entry, the caller
 * should hold this table entry's bit_spinlock to indicate that the
 * entry is being accessed.
 */
static void zram_free_page(struct zram *zram, size_t index)
{
	struct zram_meta *meta = zram->meta;
	unsigned long handle = meta->table[index].handle;

	if (unlikely(!handle)) {
		/*
		 * No memory is allocated for zero filled pages.
		 * Simply clear zero page flag.
		 */
		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
			zram_clear_flag(meta, index, ZRAM_ZERO);
			atomic64_dec(&zram->stats.zero_pages);
		}
		return;
	}

	zs_free(meta->mem_pool, handle);

	atomic64_sub(zram_get_obj_size(meta, index),
			&zram->stats.compr_data_size);
	atomic64_dec(&zram->stats.pages_stored);

	meta->table[index].handle = 0;
	zram_set_obj_size(meta, index, 0);
}

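/*
 * Decompress the object at @index into the PAGE_SIZE buffer @mem while
 * holding the entry's ZRAM_ACCESS bit_spinlock. Unallocated and
 * ZRAM_ZERO entries are expanded to a zeroed page.
 */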
static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
{
	int ret = 0;
	unsigned char *cmem;
	struct zram_meta *meta = zram->meta;
	unsigned long handle;
	size_t size;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	handle = meta->table[index].handle;
	size = zram_get_obj_size(meta, index);

	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		clear_page(mem);
		return 0;
	}

	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
	if (size == PAGE_SIZE)
		copy_page(mem, cmem);
	else
		ret = zcomp_decompress(zram->comp, cmem, size, mem);
	zs_unmap_object(meta->mem_pool, handle);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	/* Should NEVER happen. Return bio error if it does. */
	if (unlikely(ret)) {
		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
		return ret;
	}

	return 0;
}

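/*
 * Read path for a single bio_vec. A partial (sub-page) read decompresses
 * into a temporary buffer and copies only the requested bytes; a full
 * page read decompresses straight into the target page.
 */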
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
			  u32 index, int offset)
{
	int ret;
	struct page *page;
	unsigned char *user_mem, *uncmem = NULL;
	struct zram_meta *meta = zram->meta;
	page = bvec->bv_page;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	if (unlikely(!meta->table[index].handle) ||
			zram_test_flag(meta, index, ZRAM_ZERO)) {
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		handle_zero_page(bvec);
		return 0;
	}
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	if (is_partial_io(bvec))
		/* Use a temporary buffer to decompress the page */
		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);

	user_mem = kmap_atomic(page);
	if (!is_partial_io(bvec))
		uncmem = user_mem;

	if (!uncmem) {
		pr_info("Unable to allocate temp memory\n");
		ret = -ENOMEM;
		goto out_cleanup;
	}

	ret = zram_decompress_page(zram, uncmem, index);
	/* Should NEVER happen. Return bio error if it does. */
	if (unlikely(ret))
		goto out_cleanup;

	if (is_partial_io(bvec))
		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
				bvec->bv_len);

	flush_dcache_page(page);
	ret = 0;
out_cleanup:
	kunmap_atomic(user_mem);
	if (is_partial_io(bvec))
		kfree(uncmem);
	return ret;
}

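/*
 * Lock-free update of the max_used_pages watermark: retry the cmpxchg
 * until either the stored maximum is at least @pages or our value wins.
 */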
static inline void update_used_max(struct zram *zram,
					const unsigned long pages)
{
	int old_max, cur_max;

	old_max = atomic_long_read(&zram->stats.max_used_pages);

	do {
		cur_max = old_max;
		if (pages > cur_max)
			old_max = atomic_long_cmpxchg(
				&zram->stats.max_used_pages, cur_max, pages);
	} while (old_max != cur_max);
}

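/*
 * Write path for a single bio_vec: partial pages are handled with a
 * read-modify-write cycle, zero-filled pages are recorded with only the
 * ZRAM_ZERO flag, and poorly compressible pages (clen > max_zpage_size)
 * are stored uncompressed as a whole PAGE_SIZE object.
 */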
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
			   int offset)
{
	int ret = 0;
	size_t clen;
	unsigned long handle;
	struct page *page;
	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
	struct zram_meta *meta = zram->meta;
	struct zcomp_strm *zstrm;
	bool locked = false;
	unsigned long alloced_pages;

	page = bvec->bv_page;
	if (is_partial_io(bvec)) {
		/*
		 * This is a partial IO. We need to read the full page
		 * before writing the changes.
		 */
		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
		if (!uncmem) {
			ret = -ENOMEM;
			goto out;
		}
		ret = zram_decompress_page(zram, uncmem, index);
		if (ret)
			goto out;
	}

	zstrm = zcomp_strm_find(zram->comp);
	locked = true;
	user_mem = kmap_atomic(page);

	if (is_partial_io(bvec)) {
		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
		       bvec->bv_len);
		kunmap_atomic(user_mem);
		user_mem = NULL;
	} else {
		uncmem = user_mem;
	}

	if (page_zero_filled(uncmem)) {
		if (user_mem)
			kunmap_atomic(user_mem);
		/* Free memory associated with this sector now. */
		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
		zram_free_page(zram, index);
		zram_set_flag(meta, index, ZRAM_ZERO);
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

		atomic64_inc(&zram->stats.zero_pages);
		ret = 0;
		goto out;
	}

	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
	if (!is_partial_io(bvec)) {
		kunmap_atomic(user_mem);
		user_mem = NULL;
		uncmem = NULL;
	}

	if (unlikely(ret)) {
		pr_err("Compression failed! err=%d\n", ret);
		goto out;
	}
	src = zstrm->buffer;
	if (unlikely(clen > max_zpage_size)) {
		clen = PAGE_SIZE;
		if (is_partial_io(bvec))
			src = uncmem;
	}

	handle = zs_malloc(meta->mem_pool, clen);
	if (!handle) {
		pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
			index, clen);
		ret = -ENOMEM;
		goto out;
	}

	alloced_pages = zs_get_total_pages(meta->mem_pool);
	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
		zs_free(meta->mem_pool, handle);
		ret = -ENOMEM;
		goto out;
	}

	update_used_max(zram, alloced_pages);

	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);

	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
		src = kmap_atomic(page);
		copy_page(cmem, src);
		kunmap_atomic(src);
	} else {
		memcpy(cmem, src, clen);
	}

	zcomp_strm_release(zram->comp, zstrm);
	locked = false;
	zs_unmap_object(meta->mem_pool, handle);

	/*
	 * Free memory associated with this sector
	 * before overwriting unused sectors.
	 */
	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	zram_free_page(zram, index);

	meta->table[index].handle = handle;
	zram_set_obj_size(meta, index, clen);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	/* Update stats */
	atomic64_add(clen, &zram->stats.compr_data_size);
	atomic64_inc(&zram->stats.pages_stored);
out:
	if (locked)
		zcomp_strm_release(zram->comp, zstrm);
	if (is_partial_io(bvec))
		kfree(uncmem);
	return ret;
}

static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
			int offset, int rw)
{
	int ret;

	if (rw == READ) {
		atomic64_inc(&zram->stats.num_reads);
		ret = zram_bvec_read(zram, bvec, index, offset);
	} else {
		atomic64_inc(&zram->stats.num_writes);
		ret = zram_bvec_write(zram, bvec, index, offset);
	}

	if (unlikely(ret)) {
		if (rw == READ)
			atomic64_inc(&zram->stats.failed_reads);
		else
			atomic64_inc(&zram->stats.failed_writes);
	}

	return ret;
}

/*
 * zram_bio_discard - handler for discard requests
 * @index: physical block index in PAGE_SIZE units
 * @offset: byte offset within physical block
 */
static void zram_bio_discard(struct zram *zram, u32 index,
			     int offset, struct bio *bio)
{
	size_t n = bio->bi_iter.bi_size;
	struct zram_meta *meta = zram->meta;

	/*
	 * zram manages data in physical block size units. Because logical block
	 * size isn't identical with physical block size on some arch, we
	 * could get a discard request pointing to a specific offset within a
	 * certain physical block.  Although we can handle this request by
	 * reading that physical block and decompressing and partially zeroing
	 * and re-compressing and then re-storing it, this isn't reasonable
	 * because our intent with a discard request is to save memory.  So
	 * skipping this logical block is appropriate here.
	 */
	if (offset) {
		if (n <= (PAGE_SIZE - offset))
			return;

		n -= (PAGE_SIZE - offset);
		index++;
	}

	while (n >= PAGE_SIZE) {
		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
		zram_free_page(zram, index);
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		atomic64_inc(&zram->stats.notify_free);
		index++;
		n -= PAGE_SIZE;
	}
}

static void zram_reset_device(struct zram *zram)
{
	down_write(&zram->init_lock);

	zram->limit_pages = 0;

	if (!init_done(zram)) {
		up_write(&zram->init_lock);
		return;
	}

	zcomp_destroy(zram->comp);
	zram->max_comp_streams = 1;
	zram_meta_free(zram->meta, zram->disksize);
	zram->meta = NULL;
	/* Reset stats */
	memset(&zram->stats, 0, sizeof(zram->stats));

	zram->disksize = 0;
	up_write(&zram->init_lock);
}

static ssize_t disksize_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	u64 disksize;
	struct zcomp *comp;
	struct zram_meta *meta;
	struct zram *zram = dev_to_zram(dev);
	int err;

	disksize = memparse(buf, NULL);
	if (!disksize)
		return -EINVAL;

	disksize = PAGE_ALIGN(disksize);
	meta = zram_meta_alloc(disksize);
	if (!meta)
		return -ENOMEM;

	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
	if (IS_ERR(comp)) {
		pr_info("Cannot initialise %s compressing backend\n",
				zram->compressor);
		err = PTR_ERR(comp);
		goto out_free_meta;
	}

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		pr_info("Cannot change disksize for initialized device\n");
		err = -EBUSY;
		goto out_destroy_comp;
	}

	zram->meta = meta;
	zram->comp = comp;
	zram->disksize = disksize;
	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
	up_write(&zram->init_lock);

	/*
	 * Revalidate disk out of the init_lock to avoid lockdep splat.
	 * It's okay because disk's capacity is protected by init_lock
	 * so that revalidate_disk always sees up-to-date capacity.
	 */
	revalidate_disk(zram->disk);

	return len;

out_destroy_comp:
	up_write(&zram->init_lock);
	zcomp_destroy(comp);
out_free_meta:
	zram_meta_free(meta, disksize);
	return err;
}

static ssize_t reset_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int ret;
	unsigned short do_reset;
	struct zram *zram;
	struct block_device *bdev;

	zram = dev_to_zram(dev);
	bdev = bdget_disk(zram->disk, 0);

	if (!bdev)
		return -ENOMEM;

	mutex_lock(&bdev->bd_mutex);
	/* Do not reset an active device! */
	if (bdev->bd_holders) {
		ret = -EBUSY;
		goto out;
	}

	ret = kstrtou16(buf, 10, &do_reset);
	if (ret)
		goto out;

	if (!do_reset) {
		ret = -EINVAL;
		goto out;
	}

	/* Make sure all pending I/O is finished */
	fsync_bdev(bdev);
	zram_reset_device(zram);
	set_capacity(zram->disk, 0);

	mutex_unlock(&bdev->bd_mutex);
	revalidate_disk(zram->disk);
	bdput(bdev);

	return len;

out:
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}

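/*
 * Walk the bio segment by segment; a bio_vec that straddles a zram page
 * boundary is split so that zram_bvec_rw() always sees at most one page.
 */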
static void __zram_make_request(struct zram *zram, struct bio *bio)
{
	int offset, rw;
	u32 index;
	struct bio_vec bvec;
	struct bvec_iter iter;

	index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
	offset = (bio->bi_iter.bi_sector &
		  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;

	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
		zram_bio_discard(zram, index, offset, bio);
		bio_endio(bio, 0);
		return;
	}

	rw = bio_data_dir(bio);
	bio_for_each_segment(bvec, bio, iter) {
		int max_transfer_size = PAGE_SIZE - offset;

		if (bvec.bv_len > max_transfer_size) {
			/*
			 * zram_bvec_rw() can only operate on a single
			 * zram page. Split the bio vector.
			 */
			struct bio_vec bv;

			bv.bv_page = bvec.bv_page;
			bv.bv_len = max_transfer_size;
			bv.bv_offset = bvec.bv_offset;

			if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
				goto out;

			bv.bv_len = bvec.bv_len - max_transfer_size;
			bv.bv_offset += max_transfer_size;
			if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
				goto out;
		} else
			if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0)
				goto out;

		update_position(&index, &offset, &bvec);
	}

	set_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_endio(bio, 0);
	return;

out:
	bio_io_error(bio);
}

/*
 * Handler function for all zram I/O requests.
 */
static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
	struct zram *zram = queue->queuedata;

	down_read(&zram->init_lock);
	if (unlikely(!init_done(zram)))
		goto error;

	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
					bio->bi_iter.bi_size)) {
		atomic64_inc(&zram->stats.invalid_io);
		goto error;
	}

	__zram_make_request(zram, bio);
	up_read(&zram->init_lock);

	return;

error:
	up_read(&zram->init_lock);
	bio_io_error(bio);
}

static void zram_slot_free_notify(struct block_device *bdev,
				unsigned long index)
{
	struct zram *zram;
	struct zram_meta *meta;

	zram = bdev->bd_disk->private_data;
	meta = zram->meta;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	zram_free_page(zram, index);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
	atomic64_inc(&zram->stats.notify_free);
}

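/*
 * rw_page block_device operation: synchronous single-page I/O that does
 * not go through a bio; on error the caller resubmits via the bio path
 * (see the comment before the final return).
 */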
static int zram_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	int offset, err;
	u32 index;
	struct zram *zram;
	struct bio_vec bv;

	zram = bdev->bd_disk->private_data;
	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
		atomic64_inc(&zram->stats.invalid_io);
		return -EINVAL;
	}

	down_read(&zram->init_lock);
	if (unlikely(!init_done(zram))) {
		err = -EIO;
		goto out_unlock;
	}

	index = sector >> SECTORS_PER_PAGE_SHIFT;
	offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;

	bv.bv_page = page;
	bv.bv_len = PAGE_SIZE;
	bv.bv_offset = 0;

	err = zram_bvec_rw(zram, &bv, index, offset, rw);
out_unlock:
	up_read(&zram->init_lock);
	/*
	 * If I/O fails, just return error (ie, non-zero) without
	 * calling page_endio.
	 * This causes the callers of rw_page (e.g., swap_readpage,
	 * __swap_writepage) to resubmit the I/O as a bio request, and
	 * bio->bi_end_io then handles the error
	 * (e.g., SetPageError, set_page_dirty and extra works).
	 */
	if (err == 0)
		page_endio(page, rw, 0);
	return err;
}

static const struct block_device_operations zram_devops = {
	.swap_slot_free_notify = zram_slot_free_notify,
	.rw_page = zram_rw_page,
	.owner = THIS_MODULE
};

static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);
static DEVICE_ATTR_WO(reset);
static DEVICE_ATTR_RO(orig_data_size);
static DEVICE_ATTR_RO(mem_used_total);
static DEVICE_ATTR_RW(mem_limit);
static DEVICE_ATTR_RW(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);

ZRAM_ATTR_RO(num_reads);
ZRAM_ATTR_RO(num_writes);
ZRAM_ATTR_RO(failed_reads);
ZRAM_ATTR_RO(failed_writes);
ZRAM_ATTR_RO(invalid_io);
ZRAM_ATTR_RO(notify_free);
ZRAM_ATTR_RO(zero_pages);
ZRAM_ATTR_RO(compr_data_size);

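/*
 * All of the attributes above appear under /sys/block/zram<id>/.
 * Typical setup from userspace (illustrative only):
 *
 *	echo lzo > /sys/block/zram0/comp_algorithm
 *	echo 512M > /sys/block/zram0/disksize
 *	mkswap /dev/zram0 && swapon /dev/zram0
 */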
static struct attribute *zram_disk_attrs[] = {
	&dev_attr_disksize.attr,
	&dev_attr_initstate.attr,
	&dev_attr_reset.attr,
	&dev_attr_num_reads.attr,
	&dev_attr_num_writes.attr,
	&dev_attr_failed_reads.attr,
	&dev_attr_failed_writes.attr,
	&dev_attr_invalid_io.attr,
	&dev_attr_notify_free.attr,
	&dev_attr_zero_pages.attr,
	&dev_attr_orig_data_size.attr,
	&dev_attr_compr_data_size.attr,
	&dev_attr_mem_used_total.attr,
	&dev_attr_mem_limit.attr,
	&dev_attr_mem_used_max.attr,
	&dev_attr_max_comp_streams.attr,
	&dev_attr_comp_algorithm.attr,
	NULL,
};

static struct attribute_group zram_disk_attr_group = {
	.attrs = zram_disk_attrs,
};

static int create_device(struct zram *zram, int device_id)
{
	int ret = -ENOMEM;

	init_rwsem(&zram->init_lock);

	zram->queue = blk_alloc_queue(GFP_KERNEL);
	if (!zram->queue) {
		pr_err("Error allocating disk queue for device %d\n",
			device_id);
		goto out;
	}

	blk_queue_make_request(zram->queue, zram_make_request);
	zram->queue->queuedata = zram;

	/* gendisk structure */
	zram->disk = alloc_disk(1);
	if (!zram->disk) {
		pr_warn("Error allocating disk structure for device %d\n",
			device_id);
		goto out_free_queue;
	}

	zram->disk->major = zram_major;
	zram->disk->first_minor = device_id;
	zram->disk->fops = &zram_devops;
	zram->disk->queue = zram->queue;
	zram->disk->private_data = zram;
	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);

	/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
	set_capacity(zram->disk, 0);
	/* zram devices sort of resemble non-rotational disks */
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
	/*
	 * To ensure that we always get PAGE_SIZE aligned
	 * and n*PAGE_SIZE sized I/O requests.
	 */
	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
	blk_queue_logical_block_size(zram->disk->queue,
					ZRAM_LOGICAL_BLOCK_SIZE);
	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
	zram->disk->queue->limits.max_discard_sectors = UINT_MAX;
	/*
	 * zram_bio_discard() will clear all logical blocks if logical block
	 * size is identical with physical block size(PAGE_SIZE). But if it is
	 * different, we will skip discarding some parts of logical blocks in
	 * the part of the request range which isn't aligned to physical block
	 * size.  So we can't ensure that all discarded logical blocks are
	 * zeroed.
	 */
	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
		zram->disk->queue->limits.discard_zeroes_data = 1;
	else
		zram->disk->queue->limits.discard_zeroes_data = 0;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);

	add_disk(zram->disk);

	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
				&zram_disk_attr_group);
	if (ret < 0) {
		pr_warn("Error creating sysfs group\n");
		goto out_free_disk;
	}
	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
	zram->meta = NULL;
	zram->max_comp_streams = 1;
	return 0;

out_free_disk:
	del_gendisk(zram->disk);
	put_disk(zram->disk);
out_free_queue:
	blk_cleanup_queue(zram->queue);
out:
	return ret;
}

static void destroy_device(struct zram *zram)
{
	sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
			&zram_disk_attr_group);

	del_gendisk(zram->disk);
	put_disk(zram->disk);

	blk_cleanup_queue(zram->queue);
}

static int __init zram_init(void)
{
	int ret, dev_id;

	if (num_devices > max_num_devices) {
		pr_warn("Invalid value for num_devices: %u\n",
				num_devices);
		ret = -EINVAL;
		goto out;
	}

	zram_major = register_blkdev(0, "zram");
	if (zram_major <= 0) {
		pr_warn("Unable to get major number\n");
		ret = -EBUSY;
		goto out;
	}

	/* Allocate the device array and initialize each one */
	zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
	if (!zram_devices) {
		ret = -ENOMEM;
		goto unregister;
	}

	for (dev_id = 0; dev_id < num_devices; dev_id++) {
		ret = create_device(&zram_devices[dev_id], dev_id);
		if (ret)
			goto free_devices;
	}

	pr_info("Created %u device(s) ...\n", num_devices);

	return 0;

free_devices:
	while (dev_id)
		destroy_device(&zram_devices[--dev_id]);
	kfree(zram_devices);
unregister:
	unregister_blkdev(zram_major, "zram");
out:
	return ret;
}

static void __exit zram_exit(void)
{
	int i;
	struct zram *zram;

	for (i = 0; i < num_devices; i++) {
		zram = &zram_devices[i];

		destroy_device(zram);
		/*
		 * Shouldn't access zram->disk after destroy_device
		 * because destroy_device already released zram->disk.
		 */
		zram_reset_device(zram);
	}

	unregister_blkdev(zram_major, "zram");

	kfree(zram_devices);
	pr_debug("Cleanup done!\n");
}

module_init(zram_init);
module_exit(zram_exit);

module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of zram devices");

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Block Device");