/*
 * Compressed RAM block device
 *
 * Copyright (C) 2008, 2009, 2010  Nitin Gupta
 *               2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the licence that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 *
 */

#define KMSG_COMPONENT "zram"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#ifdef CONFIG_ZRAM_DEBUG
#define DEBUG
#endif

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/err.h>

#include "zram_drv.h"

/* Globals */
static int zram_major;
static struct zram *zram_devices;
static const char *default_compressor = "lzo";

/* Module params (documentation at end) */
static unsigned int num_devices = 1;
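
/*
 * ZRAM_ATTR_RO(name) expands to a read-only sysfs attribute "name" whose
 * show handler prints the matching 64-bit counter from zram->stats.
 */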

#define ZRAM_ATTR_RO(name)						\
static ssize_t name##_show(struct device *d,				\
				struct device_attribute *attr, char *b)	\
{									\
	struct zram *zram = dev_to_zram(d);				\
	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
		(u64)atomic64_read(&zram->stats.name));			\
}									\
static DEVICE_ATTR_RO(name);

static inline bool init_done(struct zram *zram)
{
	return zram->disksize;
}

static inline struct zram *dev_to_zram(struct device *dev)
{
	return (struct zram *)dev_to_disk(dev)->private_data;
}

static ssize_t disksize_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
}

static ssize_t initstate_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u32 val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = init_done(zram);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
}

static ssize_t orig_data_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);

	return scnprintf(buf, PAGE_SIZE, "%llu\n",
		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
}

static ssize_t mem_used_total_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val = 0;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	if (init_done(zram)) {
		struct zram_meta *meta = zram->meta;
		val = zs_get_total_pages(meta->mem_pool);
	}
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t max_comp_streams_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	int val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = zram->max_comp_streams;
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}

static ssize_t mem_limit_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = zram->limit_pages;
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t mem_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	u64 limit;
	char *tmp;
	struct zram *zram = dev_to_zram(dev);

	limit = memparse(buf, &tmp);
	if (buf == tmp) /* no chars parsed, invalid input */
		return -EINVAL;

	down_write(&zram->init_lock);
	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
	up_write(&zram->init_lock);

	return len;
}

static ssize_t mem_used_max_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val = 0;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	if (init_done(zram))
		val = atomic_long_read(&zram->stats.max_used_pages);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t mem_used_max_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int err;
	unsigned long val;
	struct zram *zram = dev_to_zram(dev);

	err = kstrtoul(buf, 10, &val);
	if (err || val != 0)
		return -EINVAL;

	down_read(&zram->init_lock);
	if (init_done(zram)) {
		struct zram_meta *meta = zram->meta;
		atomic_long_set(&zram->stats.max_used_pages,
				zs_get_total_pages(meta->mem_pool));
	}
	up_read(&zram->init_lock);

	return len;
}

static ssize_t max_comp_streams_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int num;
	struct zram *zram = dev_to_zram(dev);
	int ret;

	ret = kstrtoint(buf, 0, &num);
	if (ret < 0)
		return ret;
	if (num < 1)
		return -EINVAL;

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		if (!zcomp_set_max_streams(zram->comp, num)) {
			pr_info("Cannot change max compression streams\n");
			ret = -EINVAL;
			goto out;
		}
	}

	zram->max_comp_streams = num;
	ret = len;
out:
	up_write(&zram->init_lock);
	return ret;
}

static ssize_t comp_algorithm_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	size_t sz;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	sz = zcomp_available_show(zram->compressor, buf);
	up_read(&zram->init_lock);

	return sz;
}

static ssize_t comp_algorithm_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct zram *zram = dev_to_zram(dev);
	down_write(&zram->init_lock);
	if (init_done(zram)) {
		up_write(&zram->init_lock);
		pr_info("Can't change algorithm for initialized device\n");
		return -EBUSY;
	}
	strlcpy(zram->compressor, buf, sizeof(zram->compressor));
	up_write(&zram->init_lock);
	return len;
}

/* flag operations need meta->tb_lock */
static int zram_test_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	return meta->table[index].value & BIT(flag);
}

static void zram_set_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	meta->table[index].value |= BIT(flag);
}

static void zram_clear_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	meta->table[index].value &= ~BIT(flag);
}
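
/*
 * table[index].value packs the per-entry flags in the bits from
 * ZRAM_FLAG_SHIFT upwards and the compressed object size in the bits
 * below ZRAM_FLAG_SHIFT; the helpers below extract and update the
 * size part without disturbing the flags.
 */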

static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
{
	return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
}

static void zram_set_obj_size(struct zram_meta *meta,
					u32 index, size_t size)
{
	unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;

	meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
}

static inline int is_partial_io(struct bio_vec *bvec)
{
	return bvec->bv_len != PAGE_SIZE;
}

/*
 * Check if request is within bounds and aligned on zram logical blocks.
 */
static inline int valid_io_request(struct zram *zram,
		sector_t start, unsigned int size)
{
	u64 end, bound;

	/* unaligned request */
	if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
		return 0;
	if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
		return 0;

	end = start + (size >> SECTOR_SHIFT);
	bound = zram->disksize >> SECTOR_SHIFT;
	/* out of range */
	if (unlikely(start >= bound || end > bound || start > end))
		return 0;

	/* I/O request is valid */
	return 1;
}

static void zram_meta_free(struct zram_meta *meta, u64 disksize)
{
	size_t num_pages = disksize >> PAGE_SHIFT;
	size_t index;

	/* Free all pages that are still in this zram device */
	for (index = 0; index < num_pages; index++) {
		unsigned long handle = meta->table[index].handle;

		if (!handle)
			continue;

		zs_free(meta->mem_pool, handle);
	}

	zs_destroy_pool(meta->mem_pool);
	vfree(meta->table);
	kfree(meta);
}

static struct zram_meta *zram_meta_alloc(u64 disksize)
{
	size_t num_pages;
	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);

	if (!meta)
		return NULL;

	num_pages = disksize >> PAGE_SHIFT;
	meta->table = vzalloc(num_pages * sizeof(*meta->table));
	if (!meta->table) {
		pr_err("Error allocating zram address table\n");
		goto out_error;
	}

	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
	if (!meta->mem_pool) {
		pr_err("Error creating memory pool\n");
		goto out_error;
	}

	return meta;

out_error:
	vfree(meta->table);
	kfree(meta);
	return NULL;
}
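
/*
 * zram->refcount guards the lifetime of zram->meta: every in-flight request
 * takes a reference with zram_meta_get() and drops it with zram_meta_put().
 * The initial reference is set in disksize_store() and dropped by
 * zram_reset_device(), which then waits for the count to reach zero before
 * freeing the metadata.
 */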

static inline bool zram_meta_get(struct zram *zram)
{
	if (atomic_inc_not_zero(&zram->refcount))
		return true;
	return false;
}

static inline void zram_meta_put(struct zram *zram)
{
	atomic_dec(&zram->refcount);
}

static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
{
	if (*offset + bvec->bv_len >= PAGE_SIZE)
		(*index)++;
	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
}

static int page_zero_filled(void *ptr)
{
	unsigned int pos;
	unsigned long *page;

	page = (unsigned long *)ptr;

	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
		if (page[pos])
			return 0;
	}

	return 1;
}

static void handle_zero_page(struct bio_vec *bvec)
{
	struct page *page = bvec->bv_page;
	void *user_mem;

	user_mem = kmap_atomic(page);
	if (is_partial_io(bvec))
		memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
	else
		clear_page(user_mem);
	kunmap_atomic(user_mem);

	flush_dcache_page(page);
}

/*
 * To protect concurrent access to the same index entry,
 * caller should hold this table index entry's bit_spinlock to
 * indicate this index entry is being accessed.
 */
static void zram_free_page(struct zram *zram, size_t index)
{
	struct zram_meta *meta = zram->meta;
	unsigned long handle = meta->table[index].handle;

	if (unlikely(!handle)) {
		/*
		 * No memory is allocated for zero filled pages.
		 * Simply clear zero page flag.
		 */
		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
			zram_clear_flag(meta, index, ZRAM_ZERO);
			atomic64_dec(&zram->stats.zero_pages);
		}
		return;
	}

	zs_free(meta->mem_pool, handle);

	atomic64_sub(zram_get_obj_size(meta, index),
			&zram->stats.compr_data_size);
	atomic64_dec(&zram->stats.pages_stored);

	meta->table[index].handle = 0;
	zram_set_obj_size(meta, index, 0);
}

static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
{
	int ret = 0;
	unsigned char *cmem;
	struct zram_meta *meta = zram->meta;
	unsigned long handle;
	size_t size;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	handle = meta->table[index].handle;
	size = zram_get_obj_size(meta, index);

	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		clear_page(mem);
		return 0;
	}

	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
	if (size == PAGE_SIZE)
		copy_page(mem, cmem);
	else
		ret = zcomp_decompress(zram->comp, cmem, size, mem);
	zs_unmap_object(meta->mem_pool, handle);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	/* Should NEVER happen. Return bio error if it does. */
	if (unlikely(ret)) {
		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
		return ret;
	}

	return 0;
}

static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
			  u32 index, int offset)
{
	int ret;
	struct page *page;
	unsigned char *user_mem, *uncmem = NULL;
	struct zram_meta *meta = zram->meta;
	page = bvec->bv_page;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	if (unlikely(!meta->table[index].handle) ||
			zram_test_flag(meta, index, ZRAM_ZERO)) {
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		handle_zero_page(bvec);
		return 0;
	}
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	if (is_partial_io(bvec))
		/* Use a temporary buffer to decompress the page */
		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);

	user_mem = kmap_atomic(page);
	if (!is_partial_io(bvec))
		uncmem = user_mem;

	if (!uncmem) {
		pr_info("Unable to allocate temp memory\n");
		ret = -ENOMEM;
		goto out_cleanup;
	}

	ret = zram_decompress_page(zram, uncmem, index);
	/* Should NEVER happen. Return bio error if it does. */
	if (unlikely(ret))
		goto out_cleanup;

	if (is_partial_io(bvec))
		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
				bvec->bv_len);

	flush_dcache_page(page);
	ret = 0;
out_cleanup:
	kunmap_atomic(user_mem);
	if (is_partial_io(bvec))
		kfree(uncmem);
	return ret;
}
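
/*
 * Lock-free update of the allocated-pages high-water mark: retry the
 * cmpxchg until either our value is published or another CPU has already
 * recorded a maximum that is at least as large.
 */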

static inline void update_used_max(struct zram *zram,
					const unsigned long pages)
{
	int old_max, cur_max;

	old_max = atomic_long_read(&zram->stats.max_used_pages);

	do {
		cur_max = old_max;
		if (pages > cur_max)
			old_max = atomic_long_cmpxchg(
				&zram->stats.max_used_pages, cur_max, pages);
	} while (old_max != cur_max);
}
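
/*
 * Write path: a partial write first reads back and merges the existing
 * page, an all-zero page is only flagged ZRAM_ZERO instead of being
 * stored, and data that compresses poorly (clen > max_zpage_size) is
 * stored uncompressed at PAGE_SIZE.
 */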

static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
			   int offset)
{
	int ret = 0;
	size_t clen;
	unsigned long handle;
	struct page *page;
	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
	struct zram_meta *meta = zram->meta;
	struct zcomp_strm *zstrm;
	bool locked = false;
	unsigned long alloced_pages;

	page = bvec->bv_page;
	if (is_partial_io(bvec)) {
		/*
		 * This is a partial IO. We need to read the full page
		 * before writing the changes.
		 */
		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
		if (!uncmem) {
			ret = -ENOMEM;
			goto out;
		}
		ret = zram_decompress_page(zram, uncmem, index);
		if (ret)
			goto out;
	}

	zstrm = zcomp_strm_find(zram->comp);
	locked = true;
	user_mem = kmap_atomic(page);

	if (is_partial_io(bvec)) {
		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
		       bvec->bv_len);
		kunmap_atomic(user_mem);
		user_mem = NULL;
	} else {
		uncmem = user_mem;
	}

	if (page_zero_filled(uncmem)) {
		if (user_mem)
			kunmap_atomic(user_mem);
		/* Free memory associated with this sector now. */
		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
		zram_free_page(zram, index);
		zram_set_flag(meta, index, ZRAM_ZERO);
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

		atomic64_inc(&zram->stats.zero_pages);
		ret = 0;
		goto out;
	}

	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
	if (!is_partial_io(bvec)) {
		kunmap_atomic(user_mem);
		user_mem = NULL;
		uncmem = NULL;
	}

	if (unlikely(ret)) {
		pr_err("Compression failed! err=%d\n", ret);
		goto out;
	}
	src = zstrm->buffer;
	if (unlikely(clen > max_zpage_size)) {
		clen = PAGE_SIZE;
		if (is_partial_io(bvec))
			src = uncmem;
	}

	handle = zs_malloc(meta->mem_pool, clen);
	if (!handle) {
		pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
			index, clen);
		ret = -ENOMEM;
		goto out;
	}

	alloced_pages = zs_get_total_pages(meta->mem_pool);
	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
		zs_free(meta->mem_pool, handle);
		ret = -ENOMEM;
		goto out;
	}

	update_used_max(zram, alloced_pages);

	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);

	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
		src = kmap_atomic(page);
		copy_page(cmem, src);
		kunmap_atomic(src);
	} else {
		memcpy(cmem, src, clen);
	}

	zcomp_strm_release(zram->comp, zstrm);
	locked = false;
	zs_unmap_object(meta->mem_pool, handle);

	/*
	 * Free memory associated with this sector
	 * before overwriting unused sectors.
	 */
	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	zram_free_page(zram, index);

	meta->table[index].handle = handle;
	zram_set_obj_size(meta, index, clen);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	/* Update stats */
	atomic64_add(clen, &zram->stats.compr_data_size);
	atomic64_inc(&zram->stats.pages_stored);
out:
	if (locked)
		zcomp_strm_release(zram->comp, zstrm);
	if (is_partial_io(bvec))
		kfree(uncmem);
	return ret;
}

static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
			int offset, int rw)
{
	int ret;

	if (rw == READ) {
		atomic64_inc(&zram->stats.num_reads);
		ret = zram_bvec_read(zram, bvec, index, offset);
	} else {
		atomic64_inc(&zram->stats.num_writes);
		ret = zram_bvec_write(zram, bvec, index, offset);
	}

	if (unlikely(ret)) {
		if (rw == READ)
			atomic64_inc(&zram->stats.failed_reads);
		else
			atomic64_inc(&zram->stats.failed_writes);
	}

	return ret;
}

/*
 * zram_bio_discard - handler on discard request
 * @index: physical block index in PAGE_SIZE units
 * @offset: byte offset within physical block
 */
static void zram_bio_discard(struct zram *zram, u32 index,
			     int offset, struct bio *bio)
{
	size_t n = bio->bi_iter.bi_size;
	struct zram_meta *meta = zram->meta;

	/*
	 * zram manages data in physical block size units. Because logical block
	 * size isn't identical with physical block size on some arch, we
	 * could get a discard request pointing to a specific offset within a
	 * certain physical block.  Although we can handle this request by
	 * reading that physical block and decompressing and partially zeroing
	 * and re-compressing and then re-storing it, this isn't reasonable
	 * because our intent with a discard request is to save memory.  So
	 * skipping this logical block is appropriate here.
	 */
	if (offset) {
		if (n <= (PAGE_SIZE - offset))
			return;

		n -= (PAGE_SIZE - offset);
		index++;
	}

	while (n >= PAGE_SIZE) {
		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
		zram_free_page(zram, index);
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		atomic64_inc(&zram->stats.notify_free);
		index++;
		n -= PAGE_SIZE;
	}
}
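
/*
 * Tear down an initialized device: drop the initial reference, wait for
 * in-flight I/O to drain, then free the metadata and the compression
 * backend outside init_lock to avoid deadlocks with the reclaim path.
 */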

static void zram_reset_device(struct zram *zram)
{
	struct zram_meta *meta;
	struct zcomp *comp;
	u64 disksize;

	down_write(&zram->init_lock);

	zram->limit_pages = 0;

	if (!init_done(zram)) {
		up_write(&zram->init_lock);
		return;
	}

	meta = zram->meta;
	comp = zram->comp;
	disksize = zram->disksize;
	/*
	 * Refcount will go down to 0 eventually and r/w handler
	 * cannot handle further I/O so it will bail out by
	 * checking zram_meta_get.
	 */
	zram_meta_put(zram);
	/*
	 * We want to free zram_meta in process context to avoid
	 * deadlock between reclaim path and any other locks.
	 */
	wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);

	/* Reset stats */
	memset(&zram->stats, 0, sizeof(zram->stats));
	zram->disksize = 0;
	zram->max_comp_streams = 1;
	set_capacity(zram->disk, 0);

	up_write(&zram->init_lock);
	/* I/O on all CPUs is done, so it is safe to free */
	zram_meta_free(meta, disksize);
	zcomp_destroy(comp);
}

static ssize_t disksize_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	u64 disksize;
	struct zcomp *comp;
	struct zram_meta *meta;
	struct zram *zram = dev_to_zram(dev);
	int err;

	disksize = memparse(buf, NULL);
	if (!disksize)
		return -EINVAL;

	disksize = PAGE_ALIGN(disksize);
	meta = zram_meta_alloc(disksize);
	if (!meta)
		return -ENOMEM;

	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
	if (IS_ERR(comp)) {
		pr_info("Cannot initialise %s compressing backend\n",
				zram->compressor);
		err = PTR_ERR(comp);
		goto out_free_meta;
	}

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		pr_info("Cannot change disksize for initialized device\n");
		err = -EBUSY;
		goto out_destroy_comp;
	}

	init_waitqueue_head(&zram->io_done);
	atomic_set(&zram->refcount, 1);
	zram->meta = meta;
	zram->comp = comp;
	zram->disksize = disksize;
	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
	up_write(&zram->init_lock);

	/*
	 * Revalidate disk out of the init_lock to avoid lockdep splat.
	 * It's okay because disk's capacity is protected by init_lock
	 * so that revalidate_disk always sees up-to-date capacity.
	 */
	revalidate_disk(zram->disk);

	return len;

out_destroy_comp:
	up_write(&zram->init_lock);
	zcomp_destroy(comp);
out_free_meta:
	zram_meta_free(meta, disksize);
	return err;
}

static ssize_t reset_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int ret;
	unsigned short do_reset;
	struct zram *zram;
	struct block_device *bdev;

	zram = dev_to_zram(dev);
	bdev = bdget_disk(zram->disk, 0);

	if (!bdev)
		return -ENOMEM;

	mutex_lock(&bdev->bd_mutex);
	/* Do not reset an active device! */
	if (bdev->bd_openers) {
		ret = -EBUSY;
		goto out;
	}

	ret = kstrtou16(buf, 10, &do_reset);
	if (ret)
		goto out;

	if (!do_reset) {
		ret = -EINVAL;
		goto out;
	}

	/* Make sure all pending I/O is finished */
	fsync_bdev(bdev);
	zram_reset_device(zram);

	mutex_unlock(&bdev->bd_mutex);
	revalidate_disk(zram->disk);
	bdput(bdev);

	return len;

out:
	mutex_unlock(&bdev->bd_mutex);
	bdput(bdev);
	return ret;
}

static void __zram_make_request(struct zram *zram, struct bio *bio)
{
	int offset, rw;
	u32 index;
	struct bio_vec bvec;
	struct bvec_iter iter;

	index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
	offset = (bio->bi_iter.bi_sector &
		  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;

	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
		zram_bio_discard(zram, index, offset, bio);
		bio_endio(bio, 0);
		return;
	}

	rw = bio_data_dir(bio);
	bio_for_each_segment(bvec, bio, iter) {
		int max_transfer_size = PAGE_SIZE - offset;

		if (bvec.bv_len > max_transfer_size) {
			/*
			 * zram_bvec_rw() can only operate on a single
			 * zram page. Split the bio vector.
			 */
			struct bio_vec bv;

			bv.bv_page = bvec.bv_page;
			bv.bv_len = max_transfer_size;
			bv.bv_offset = bvec.bv_offset;

			if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
				goto out;

			bv.bv_len = bvec.bv_len - max_transfer_size;
			bv.bv_offset += max_transfer_size;
			if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
				goto out;
		} else
			if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0)
				goto out;

		update_position(&index, &offset, &bvec);
	}

	set_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_endio(bio, 0);
	return;

out:
	bio_io_error(bio);
}

/*
 * Handler function for all zram I/O requests.
 */
static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
	struct zram *zram = queue->queuedata;

	if (unlikely(!zram_meta_get(zram)))
		goto error;

	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
					bio->bi_iter.bi_size)) {
		atomic64_inc(&zram->stats.invalid_io);
		goto put_zram;
	}

	__zram_make_request(zram, bio);
	zram_meta_put(zram);
	return;
put_zram:
	zram_meta_put(zram);
error:
	bio_io_error(bio);
}

static void zram_slot_free_notify(struct block_device *bdev,
				unsigned long index)
{
	struct zram *zram;
	struct zram_meta *meta;

	zram = bdev->bd_disk->private_data;
	meta = zram->meta;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	zram_free_page(zram, index);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
	atomic64_inc(&zram->stats.notify_free);
}

static int zram_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	int offset, err = -EIO;
	u32 index;
	struct zram *zram;
	struct bio_vec bv;

	zram = bdev->bd_disk->private_data;
	if (unlikely(!zram_meta_get(zram)))
		goto out;

	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
		atomic64_inc(&zram->stats.invalid_io);
		err = -EINVAL;
		goto put_zram;
	}

	index = sector >> SECTORS_PER_PAGE_SHIFT;
	offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;

	bv.bv_page = page;
	bv.bv_len = PAGE_SIZE;
	bv.bv_offset = 0;

	err = zram_bvec_rw(zram, &bv, index, offset, rw);
put_zram:
	zram_meta_put(zram);
out:
	/*
	 * If I/O fails, just return the error (i.e. non-zero) without
	 * calling page_endio. The callers of rw_page (e.g. swap_readpage,
	 * __swap_writepage) will then resubmit the I/O as a bio request,
	 * and bio->bi_end_io handles the error (e.g. SetPageError,
	 * set_page_dirty and related work).
	 */
	if (err == 0)
		page_endio(page, rw, 0);
	return err;
}

static const struct block_device_operations zram_devops = {
	.swap_slot_free_notify = zram_slot_free_notify,
	.rw_page = zram_rw_page,
	.owner = THIS_MODULE
};

static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);
static DEVICE_ATTR_WO(reset);
static DEVICE_ATTR_RO(orig_data_size);
static DEVICE_ATTR_RO(mem_used_total);
static DEVICE_ATTR_RW(mem_limit);
static DEVICE_ATTR_RW(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);

ZRAM_ATTR_RO(num_reads);
ZRAM_ATTR_RO(num_writes);
ZRAM_ATTR_RO(failed_reads);
ZRAM_ATTR_RO(failed_writes);
ZRAM_ATTR_RO(invalid_io);
ZRAM_ATTR_RO(notify_free);
ZRAM_ATTR_RO(zero_pages);
ZRAM_ATTR_RO(compr_data_size);
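
/*
 * All of the attributes above appear under /sys/block/zram<id>/. A typical
 * configuration sequence from userspace looks roughly like this (a sketch;
 * device name, algorithm and size are only examples, and the algorithm and
 * stream count must be set before disksize initializes the device):
 *
 *	echo lzo > /sys/block/zram0/comp_algorithm
 *	echo 4 > /sys/block/zram0/max_comp_streams
 *	echo 512M > /sys/block/zram0/disksize
 *	mkswap /dev/zram0 && swapon /dev/zram0
 */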

static struct attribute *zram_disk_attrs[] = {
	&dev_attr_disksize.attr,
	&dev_attr_initstate.attr,
	&dev_attr_reset.attr,
	&dev_attr_num_reads.attr,
	&dev_attr_num_writes.attr,
	&dev_attr_failed_reads.attr,
	&dev_attr_failed_writes.attr,
	&dev_attr_invalid_io.attr,
	&dev_attr_notify_free.attr,
	&dev_attr_zero_pages.attr,
	&dev_attr_orig_data_size.attr,
	&dev_attr_compr_data_size.attr,
	&dev_attr_mem_used_total.attr,
	&dev_attr_mem_limit.attr,
	&dev_attr_mem_used_max.attr,
	&dev_attr_max_comp_streams.attr,
	&dev_attr_comp_algorithm.attr,
	NULL,
};

static struct attribute_group zram_disk_attr_group = {
	.attrs = zram_disk_attrs,
};

static int create_device(struct zram *zram, int device_id)
{
	struct request_queue *queue;
	int ret = -ENOMEM;

	init_rwsem(&zram->init_lock);

	queue = blk_alloc_queue(GFP_KERNEL);
	if (!queue) {
		pr_err("Error allocating disk queue for device %d\n",
			device_id);
		goto out;
	}

	blk_queue_make_request(queue, zram_make_request);

	/* gendisk structure */
	zram->disk = alloc_disk(1);
	if (!zram->disk) {
		pr_warn("Error allocating disk structure for device %d\n",
			device_id);
		goto out_free_queue;
	}

	zram->disk->major = zram_major;
	zram->disk->first_minor = device_id;
	zram->disk->fops = &zram_devops;
	zram->disk->queue = queue;
	zram->disk->queue->queuedata = zram;
	zram->disk->private_data = zram;
	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);

	/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
	set_capacity(zram->disk, 0);
	/* zram devices sort of resemble non-rotational disks */
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
	/*
	 * To ensure that we always get PAGE_SIZE aligned
	 * and n*PAGE_SIZE sized I/O requests.
	 */
	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
	blk_queue_logical_block_size(zram->disk->queue,
					ZRAM_LOGICAL_BLOCK_SIZE);
	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
	zram->disk->queue->limits.max_discard_sectors = UINT_MAX;
	/*
	 * zram_bio_discard() will clear all logical blocks if logical block
	 * size is identical with physical block size(PAGE_SIZE). But if it is
	 * different, we will skip discarding some parts of logical blocks in
	 * the part of the request range which isn't aligned to physical block
	 * size.  So we can't ensure that all discarded logical blocks are
	 * zeroed.
	 */
	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
		zram->disk->queue->limits.discard_zeroes_data = 1;
	else
		zram->disk->queue->limits.discard_zeroes_data = 0;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);

	add_disk(zram->disk);

	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
				&zram_disk_attr_group);
	if (ret < 0) {
		pr_warn("Error creating sysfs group\n");
		goto out_free_disk;
	}
	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
	zram->meta = NULL;
	zram->max_comp_streams = 1;
	return 0;

out_free_disk:
	del_gendisk(zram->disk);
	put_disk(zram->disk);
out_free_queue:
	blk_cleanup_queue(queue);
out:
	return ret;
}

static void destroy_devices(unsigned int nr)
{
	struct zram *zram;
	unsigned int i;

	for (i = 0; i < nr; i++) {
		zram = &zram_devices[i];
		/*
		 * Remove sysfs first, so no one will perform a disksize
		 * store while we destroy the devices
		 */
		sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
				&zram_disk_attr_group);

		zram_reset_device(zram);

		blk_cleanup_queue(zram->disk->queue);
		del_gendisk(zram->disk);
		put_disk(zram->disk);
	}

	kfree(zram_devices);
	unregister_blkdev(zram_major, "zram");
	pr_info("Destroyed %u device(s)\n", nr);
}

static int __init zram_init(void)
{
	int ret, dev_id;

	if (num_devices > max_num_devices) {
		pr_warn("Invalid value for num_devices: %u\n",
				num_devices);
		return -EINVAL;
	}

	zram_major = register_blkdev(0, "zram");
	if (zram_major <= 0) {
		pr_warn("Unable to get major number\n");
		return -EBUSY;
	}

	/* Allocate the device array and initialize each one */
	zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
	if (!zram_devices) {
		unregister_blkdev(zram_major, "zram");
		return -ENOMEM;
	}

	for (dev_id = 0; dev_id < num_devices; dev_id++) {
		ret = create_device(&zram_devices[dev_id], dev_id);
		if (ret)
			goto out_error;
	}

	pr_info("Created %u device(s)\n", num_devices);
	return 0;

out_error:
	destroy_devices(dev_id);
	return ret;
}

static void __exit zram_exit(void)
{
	destroy_devices(num_devices);
}

module_init(zram_init);
module_exit(zram_exit);

module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of zram devices");

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Block Device");