/*
 * Compressed RAM block device
 *
 * Copyright (C) 2008, 2009, 2010  Nitin Gupta
 *               2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the licence that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 *
 */

#define KMSG_COMPONENT "zram"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#ifdef CONFIG_ZRAM_DEBUG
#define DEBUG
#endif

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/bitops.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/device.h>
#include <linux/genhd.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/vmalloc.h>
#include <linux/err.h>

#include "zram_drv.h"

/* Globals */
static int zram_major;
static struct zram *zram_devices;
static const char *default_compressor = "lzo";

/* Module params (documentation at end) */
static unsigned int num_devices = 1;

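/*
 * Generate a read-only sysfs attribute that prints a single atomic64
 * counter from struct zram's stats.
 */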
#define ZRAM_ATTR_RO(name)						\
static ssize_t name##_show(struct device *d,		\
				struct device_attribute *attr, char *b)	\
{									\
	struct zram *zram = dev_to_zram(d);				\
	return scnprintf(b, PAGE_SIZE, "%llu\n",			\
		(u64)atomic64_read(&zram->stats.name));			\
}									\
static DEVICE_ATTR_RO(name);

static inline int init_done(struct zram *zram)
{
	return zram->meta != NULL;
}

static inline struct zram *dev_to_zram(struct device *dev)
{
	return (struct zram *)dev_to_disk(dev)->private_data;
}

static ssize_t disksize_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
}

static ssize_t initstate_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u32 val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = init_done(zram);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
}

static ssize_t orig_data_size_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	struct zram *zram = dev_to_zram(dev);

	return scnprintf(buf, PAGE_SIZE, "%llu\n",
		(u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
}

static ssize_t mem_used_total_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val = 0;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	if (init_done(zram)) {
		struct zram_meta *meta = zram->meta;
		val = zs_get_total_pages(meta->mem_pool);
	}
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t max_comp_streams_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	int val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = zram->max_comp_streams;
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
}

static ssize_t mem_limit_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	val = zram->limit_pages;
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t mem_limit_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	u64 limit;
	char *tmp;
	struct zram *zram = dev_to_zram(dev);

	limit = memparse(buf, &tmp);
	if (buf == tmp) /* no chars parsed, invalid input */
		return -EINVAL;

	down_write(&zram->init_lock);
	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
	up_write(&zram->init_lock);

	return len;
}

static ssize_t mem_used_max_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	u64 val = 0;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	if (init_done(zram))
		val = atomic_long_read(&zram->stats.max_used_pages);
	up_read(&zram->init_lock);

	return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
}

static ssize_t mem_used_max_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int err;
	unsigned long val;
	struct zram *zram = dev_to_zram(dev);

	err = kstrtoul(buf, 10, &val);
	if (err || val != 0)
		return -EINVAL;

	down_read(&zram->init_lock);
	if (init_done(zram)) {
		struct zram_meta *meta = zram->meta;
		atomic_long_set(&zram->stats.max_used_pages,
				zs_get_total_pages(meta->mem_pool));
	}
	up_read(&zram->init_lock);

	return len;
}

static ssize_t max_comp_streams_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int num;
	struct zram *zram = dev_to_zram(dev);
	int ret;

	ret = kstrtoint(buf, 0, &num);
	if (ret < 0)
		return ret;
	if (num < 1)
		return -EINVAL;

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		if (!zcomp_set_max_streams(zram->comp, num)) {
			pr_info("Cannot change max compression streams\n");
			ret = -EINVAL;
			goto out;
		}
	}

	zram->max_comp_streams = num;
	ret = len;
out:
	up_write(&zram->init_lock);
	return ret;
}

static ssize_t comp_algorithm_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	size_t sz;
	struct zram *zram = dev_to_zram(dev);

	down_read(&zram->init_lock);
	sz = zcomp_available_show(zram->compressor, buf);
	up_read(&zram->init_lock);

	return sz;
}

static ssize_t comp_algorithm_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	struct zram *zram = dev_to_zram(dev);
	down_write(&zram->init_lock);
	if (init_done(zram)) {
		up_write(&zram->init_lock);
		pr_info("Can't change algorithm for initialized device\n");
		return -EBUSY;
	}
	strlcpy(zram->compressor, buf, sizeof(zram->compressor));
	up_write(&zram->init_lock);
	return len;
}

/* flag operations need the table entry's bit_spinlock (ZRAM_ACCESS) */
static int zram_test_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	return meta->table[index].value & BIT(flag);
}

static void zram_set_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	meta->table[index].value |= BIT(flag);
}

static void zram_clear_flag(struct zram_meta *meta, u32 index,
			enum zram_pageflags flag)
{
	meta->table[index].value &= ~BIT(flag);
}

static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
{
	return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
}

static void zram_set_obj_size(struct zram_meta *meta,
					u32 index, size_t size)
{
	unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;

	meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
}

static inline int is_partial_io(struct bio_vec *bvec)
{
	return bvec->bv_len != PAGE_SIZE;
}

/*
 * Check if request is within bounds and aligned on zram logical blocks.
 */
static inline int valid_io_request(struct zram *zram,
		sector_t start, unsigned int size)
{
	u64 end, bound;

	/* unaligned request */
	if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
		return 0;
	if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
		return 0;

	end = start + (size >> SECTOR_SHIFT);
	bound = zram->disksize >> SECTOR_SHIFT;
	/* out of range */
	if (unlikely(start >= bound || end > bound || start > end))
		return 0;

	/* I/O request is valid */
	return 1;
}

static void zram_meta_free(struct zram_meta *meta)
{
	zs_destroy_pool(meta->mem_pool);
	vfree(meta->table);
	kfree(meta);
}

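/*
 * Allocate per-device metadata: one table entry per PAGE_SIZE unit of the
 * disksize plus the zsmalloc pool that backs the compressed data.
 */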
static struct zram_meta *zram_meta_alloc(u64 disksize)
{
	size_t num_pages;
	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);

	if (!meta)
		return NULL;

	num_pages = disksize >> PAGE_SHIFT;
	meta->table = vzalloc(num_pages * sizeof(*meta->table));
	if (!meta->table) {
		pr_err("Error allocating zram address table\n");
		goto out_error;
	}

	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
	if (!meta->mem_pool) {
		pr_err("Error creating memory pool\n");
		goto out_error;
	}

	return meta;

out_error:
	vfree(meta->table);
	kfree(meta);
	return NULL;
}

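/* Advance the page index/offset cursor past the bio_vec that was just handled. */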
static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
{
	if (*offset + bvec->bv_len >= PAGE_SIZE)
		(*index)++;
	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
}

static int page_zero_filled(void *ptr)
{
	unsigned int pos;
	unsigned long *page;

	page = (unsigned long *)ptr;

	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
		if (page[pos])
			return 0;
	}

	return 1;
}

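/*
 * Serve a read of a zero-filled page: clear the whole page, or only the
 * requested region for a partial I/O.
 */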
static void handle_zero_page(struct bio_vec *bvec)
{
	struct page *page = bvec->bv_page;
	void *user_mem;

	user_mem = kmap_atomic(page);
	if (is_partial_io(bvec))
		memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
	else
		clear_page(user_mem);
	kunmap_atomic(user_mem);

	flush_dcache_page(page);
}


/*
 * To protect concurrent access to the same index entry, the caller should
 * hold this table entry's bit_spinlock (ZRAM_ACCESS) to mark the entry as
 * being accessed.
 */
static void zram_free_page(struct zram *zram, size_t index)
{
	struct zram_meta *meta = zram->meta;
	unsigned long handle = meta->table[index].handle;

	if (unlikely(!handle)) {
		/*
		 * No memory is allocated for zero filled pages.
		 * Simply clear zero page flag.
		 */
		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
			zram_clear_flag(meta, index, ZRAM_ZERO);
			atomic64_dec(&zram->stats.zero_pages);
		}
		return;
	}

	zs_free(meta->mem_pool, handle);

	atomic64_sub(zram_get_obj_size(meta, index),
			&zram->stats.compr_data_size);
	atomic64_dec(&zram->stats.pages_stored);

	meta->table[index].handle = 0;
	zram_set_obj_size(meta, index, 0);
}

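/*
 * Decompress the object stored at @index into @mem, which must be a full
 * page.  Zero-filled or unallocated slots simply clear the page.  The
 * per-entry ZRAM_ACCESS bit lock is taken internally.
 */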
static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
{
	int ret = 0;
	unsigned char *cmem;
	struct zram_meta *meta = zram->meta;
	unsigned long handle;
	size_t size;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	handle = meta->table[index].handle;
	size = zram_get_obj_size(meta, index);

	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		clear_page(mem);
		return 0;
	}

	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
	if (size == PAGE_SIZE)
		copy_page(mem, cmem);
	else
		ret = zcomp_decompress(zram->comp, cmem, size, mem);
	zs_unmap_object(meta->mem_pool, handle);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	/* Should NEVER happen. Return bio error if it does. */
	if (unlikely(ret)) {
		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
		return ret;
	}

	return 0;
}

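/*
 * Read one bio_vec worth of data.  Full-page reads decompress directly into
 * the caller's page; partial reads go through a bounce buffer and copy out
 * only the requested region.
 */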
static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
			  u32 index, int offset)
{
	int ret;
	struct page *page;
	unsigned char *user_mem, *uncmem = NULL;
	struct zram_meta *meta = zram->meta;
	page = bvec->bv_page;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	if (unlikely(!meta->table[index].handle) ||
			zram_test_flag(meta, index, ZRAM_ZERO)) {
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		handle_zero_page(bvec);
		return 0;
	}
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	if (is_partial_io(bvec))
		/* Use a temporary buffer to decompress the page */
		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);

	user_mem = kmap_atomic(page);
	if (!is_partial_io(bvec))
		uncmem = user_mem;

	if (!uncmem) {
		pr_info("Unable to allocate temp memory\n");
		ret = -ENOMEM;
		goto out_cleanup;
	}

	ret = zram_decompress_page(zram, uncmem, index);
	/* Should NEVER happen. Return bio error if it does. */
	if (unlikely(ret))
		goto out_cleanup;

	if (is_partial_io(bvec))
		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
				bvec->bv_len);

	flush_dcache_page(page);
	ret = 0;
out_cleanup:
	kunmap_atomic(user_mem);
	if (is_partial_io(bvec))
		kfree(uncmem);
	return ret;
}

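/* Racelessly record a new high-water mark of allocated zsmalloc pages. */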
static inline void update_used_max(struct zram *zram,
					const unsigned long pages)
{
	int old_max, cur_max;

	old_max = atomic_long_read(&zram->stats.max_used_pages);

	do {
		cur_max = old_max;
		if (pages > cur_max)
			old_max = atomic_long_cmpxchg(
				&zram->stats.max_used_pages, cur_max, pages);
	} while (old_max != cur_max);
}

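/*
 * Compress one bio_vec and store it at @index.  Zero-filled pages are only
 * flagged ZRAM_ZERO; incompressible data (clen > max_zpage_size) is stored
 * uncompressed as a full PAGE_SIZE object.
 */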
static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
			   int offset)
{
	int ret = 0;
	size_t clen;
	unsigned long handle;
	struct page *page;
	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
	struct zram_meta *meta = zram->meta;
	struct zcomp_strm *zstrm;
	bool locked = false;
	unsigned long alloced_pages;

	page = bvec->bv_page;
	if (is_partial_io(bvec)) {
		/*
		 * This is a partial IO. We need to read the full page
		 * before writing the changes.
		 */
		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
		if (!uncmem) {
			ret = -ENOMEM;
			goto out;
		}
		ret = zram_decompress_page(zram, uncmem, index);
		if (ret)
			goto out;
	}

	zstrm = zcomp_strm_find(zram->comp);
	locked = true;
	user_mem = kmap_atomic(page);

	if (is_partial_io(bvec)) {
		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
		       bvec->bv_len);
		kunmap_atomic(user_mem);
		user_mem = NULL;
	} else {
		uncmem = user_mem;
	}

	if (page_zero_filled(uncmem)) {
		if (user_mem)
			kunmap_atomic(user_mem);
		/* Free memory associated with this sector now. */
		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
		zram_free_page(zram, index);
		zram_set_flag(meta, index, ZRAM_ZERO);
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

		atomic64_inc(&zram->stats.zero_pages);
		ret = 0;
		goto out;
	}

	ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
	if (!is_partial_io(bvec)) {
		kunmap_atomic(user_mem);
		user_mem = NULL;
		uncmem = NULL;
	}

	if (unlikely(ret)) {
		pr_err("Compression failed! err=%d\n", ret);
		goto out;
	}
	src = zstrm->buffer;
	if (unlikely(clen > max_zpage_size)) {
		clen = PAGE_SIZE;
		if (is_partial_io(bvec))
			src = uncmem;
	}

	handle = zs_malloc(meta->mem_pool, clen);
	if (!handle) {
		pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
			index, clen);
		ret = -ENOMEM;
		goto out;
	}

	alloced_pages = zs_get_total_pages(meta->mem_pool);
	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
		zs_free(meta->mem_pool, handle);
		ret = -ENOMEM;
		goto out;
	}

	update_used_max(zram, alloced_pages);

	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);

	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
		src = kmap_atomic(page);
		copy_page(cmem, src);
		kunmap_atomic(src);
	} else {
		memcpy(cmem, src, clen);
	}

	zcomp_strm_release(zram->comp, zstrm);
	locked = false;
	zs_unmap_object(meta->mem_pool, handle);

	/*
	 * Free memory associated with this sector
	 * before overwriting unused sectors.
	 */
	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	zram_free_page(zram, index);

	meta->table[index].handle = handle;
	zram_set_obj_size(meta, index, clen);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);

	/* Update stats */
	atomic64_add(clen, &zram->stats.compr_data_size);
	atomic64_inc(&zram->stats.pages_stored);
out:
	if (locked)
		zcomp_strm_release(zram->comp, zstrm);
	if (is_partial_io(bvec))
		kfree(uncmem);
	return ret;
}

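/* Dispatch a single-page read or write and account success/failure stats. */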
static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
			int offset, int rw)
{
	int ret;

	if (rw == READ) {
		atomic64_inc(&zram->stats.num_reads);
		ret = zram_bvec_read(zram, bvec, index, offset);
	} else {
		atomic64_inc(&zram->stats.num_writes);
		ret = zram_bvec_write(zram, bvec, index, offset);
	}

	if (unlikely(ret)) {
		if (rw == READ)
			atomic64_inc(&zram->stats.failed_reads);
		else
			atomic64_inc(&zram->stats.failed_writes);
	}

	return ret;
}

/*
 * zram_bio_discard - handler on discard request
 * @index: physical block index in PAGE_SIZE units
 * @offset: byte offset within physical block
 */
static void zram_bio_discard(struct zram *zram, u32 index,
			     int offset, struct bio *bio)
{
	size_t n = bio->bi_iter.bi_size;
	struct zram_meta *meta = zram->meta;

	/*
	 * zram manages data in physical block size units. Because logical block
	 * size isn't identical to physical block size on some architectures, we
	 * could get a discard request pointing to a specific offset within a
	 * certain physical block.  Although we can handle this request by
	 * reading that physical block and decompressing and partially zeroing
	 * and re-compressing and then re-storing it, this isn't reasonable
	 * because our intent with a discard request is to save memory.  So
	 * skipping this logical block is appropriate here.
	 */
	if (offset) {
		if (n <= (PAGE_SIZE - offset))
			return;

		n -= (PAGE_SIZE - offset);
		index++;
	}

	while (n >= PAGE_SIZE) {
		bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
		zram_free_page(zram, index);
		bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
		atomic64_inc(&zram->stats.notify_free);
		index++;
		n -= PAGE_SIZE;
	}
}

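/*
 * Tear down device state: free every stored object, destroy the compression
 * backend and metadata, and clear the stats.  The disk capacity is reset
 * only when @reset_capacity is true (i.e. not during module unload).
 */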
static void zram_reset_device(struct zram *zram, bool reset_capacity)
{
	size_t index;
	struct zram_meta *meta;

	down_write(&zram->init_lock);

	zram->limit_pages = 0;

	if (!init_done(zram)) {
		up_write(&zram->init_lock);
		return;
	}

	meta = zram->meta;
	/* Free all pages that are still in this zram device */
	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
		unsigned long handle = meta->table[index].handle;
		if (!handle)
			continue;

		zs_free(meta->mem_pool, handle);
	}

	zcomp_destroy(zram->comp);
	zram->max_comp_streams = 1;

	zram_meta_free(zram->meta);
	zram->meta = NULL;
	/* Reset stats */
	memset(&zram->stats, 0, sizeof(zram->stats));

	zram->disksize = 0;
	if (reset_capacity)
		set_capacity(zram->disk, 0);

	up_write(&zram->init_lock);

	/*
	 * Revalidate disk out of the init_lock to avoid lockdep splat.
	 * It's okay because disk's capacity is protected by init_lock
	 * so that revalidate_disk always sees up-to-date capacity.
	 */
	if (reset_capacity)
		revalidate_disk(zram->disk);
}

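/*
 * Set the virtual disk size: allocate the metadata and the compression
 * backend, then publish the new capacity.  Only allowed while the device
 * is not yet initialized.
 */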
static ssize_t disksize_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	u64 disksize;
	struct zcomp *comp;
	struct zram_meta *meta;
	struct zram *zram = dev_to_zram(dev);
	int err;

	disksize = memparse(buf, NULL);
	if (!disksize)
		return -EINVAL;

	disksize = PAGE_ALIGN(disksize);
	meta = zram_meta_alloc(disksize);
	if (!meta)
		return -ENOMEM;

	comp = zcomp_create(zram->compressor, zram->max_comp_streams);
	if (IS_ERR(comp)) {
		pr_info("Cannot initialise %s compressing backend\n",
				zram->compressor);
		err = PTR_ERR(comp);
		goto out_free_meta;
	}

	down_write(&zram->init_lock);
	if (init_done(zram)) {
		pr_info("Cannot change disksize for initialized device\n");
		err = -EBUSY;
		goto out_destroy_comp;
	}

	zram->meta = meta;
	zram->comp = comp;
	zram->disksize = disksize;
	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
	up_write(&zram->init_lock);

	/*
	 * Revalidate disk out of the init_lock to avoid lockdep splat.
	 * It's okay because disk's capacity is protected by init_lock
	 * so that revalidate_disk always sees up-to-date capacity.
	 */
	revalidate_disk(zram->disk);

	return len;

out_destroy_comp:
	up_write(&zram->init_lock);
	zcomp_destroy(comp);
out_free_meta:
	zram_meta_free(meta);
	return err;
}

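/*
 * Reset an idle device through the "reset" sysfs attribute.  Refused while
 * the backing block device still has holders.
 */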
static ssize_t reset_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t len)
{
	int ret;
	unsigned short do_reset;
	struct zram *zram;
	struct block_device *bdev;

	zram = dev_to_zram(dev);
	bdev = bdget_disk(zram->disk, 0);

	if (!bdev)
		return -ENOMEM;

	/* Do not reset an active device! */
	if (bdev->bd_holders) {
		ret = -EBUSY;
		goto out;
	}

	ret = kstrtou16(buf, 10, &do_reset);
	if (ret)
		goto out;

	if (!do_reset) {
		ret = -EINVAL;
		goto out;
	}

	/* Make sure all pending I/O is finished */
	fsync_bdev(bdev);
	bdput(bdev);

	zram_reset_device(zram, true);
	return len;

out:
	bdput(bdev);
	return ret;
}

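/*
 * Walk the bio segment by segment, splitting any bio_vec that crosses a
 * PAGE_SIZE boundary so that each zram_bvec_rw() call touches a single
 * zram page.
 */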
static void __zram_make_request(struct zram *zram, struct bio *bio)
{
	int offset, rw;
	u32 index;
	struct bio_vec bvec;
	struct bvec_iter iter;

	index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
	offset = (bio->bi_iter.bi_sector &
		  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;

	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
		zram_bio_discard(zram, index, offset, bio);
		bio_endio(bio, 0);
		return;
	}

	rw = bio_data_dir(bio);
	bio_for_each_segment(bvec, bio, iter) {
		int max_transfer_size = PAGE_SIZE - offset;

		if (bvec.bv_len > max_transfer_size) {
			/*
			 * zram_bvec_rw() can only operate on a single
			 * zram page. Split the bio vector.
			 */
			struct bio_vec bv;

			bv.bv_page = bvec.bv_page;
			bv.bv_len = max_transfer_size;
			bv.bv_offset = bvec.bv_offset;

			if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
				goto out;

			bv.bv_len = bvec.bv_len - max_transfer_size;
			bv.bv_offset += max_transfer_size;
			if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
				goto out;
		} else
			if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0)
				goto out;

		update_position(&index, &offset, &bvec);
	}

	set_bit(BIO_UPTODATE, &bio->bi_flags);
	bio_endio(bio, 0);
	return;

out:
	bio_io_error(bio);
}

/*
 * Handler function for all zram I/O requests.
 */
static void zram_make_request(struct request_queue *queue, struct bio *bio)
{
	struct zram *zram = queue->queuedata;

	down_read(&zram->init_lock);
	if (unlikely(!init_done(zram)))
		goto error;

	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
					bio->bi_iter.bi_size)) {
		atomic64_inc(&zram->stats.invalid_io);
		goto error;
	}

	__zram_make_request(zram, bio);
	up_read(&zram->init_lock);

	return;

error:
	up_read(&zram->init_lock);
	bio_io_error(bio);
}

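/* Called by the swap layer when a swap slot is freed; drop the stored page. */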
static void zram_slot_free_notify(struct block_device *bdev,
				unsigned long index)
{
	struct zram *zram;
	struct zram_meta *meta;

	zram = bdev->bd_disk->private_data;
	meta = zram->meta;

	bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
	zram_free_page(zram, index);
	bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
	atomic64_inc(&zram->stats.notify_free);
}

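/*
 * rw_page block device hook: serve a synchronous single-page read or write
 * without building a bio.
 */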
static int zram_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, int rw)
{
	int offset, err;
	u32 index;
	struct zram *zram;
	struct bio_vec bv;

	zram = bdev->bd_disk->private_data;
	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
		atomic64_inc(&zram->stats.invalid_io);
		return -EINVAL;
	}

	down_read(&zram->init_lock);
	if (unlikely(!init_done(zram))) {
		err = -EIO;
		goto out_unlock;
	}

	index = sector >> SECTORS_PER_PAGE_SHIFT;
	offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;

	bv.bv_page = page;
	bv.bv_len = PAGE_SIZE;
	bv.bv_offset = 0;

	err = zram_bvec_rw(zram, &bv, index, offset, rw);
out_unlock:
	up_read(&zram->init_lock);
	/*
	 * If the I/O fails, just return an error (i.e. non-zero) without
	 * calling page_endio.  The callers of rw_page (e.g. swap_readpage,
	 * __swap_writepage) will then resubmit the I/O as a regular bio, and
	 * bio->bi_end_io handles the error (e.g. SetPageError, set_page_dirty
	 * and related work).
	 */
	if (err == 0)
		page_endio(page, rw, 0);
	return err;
}

static const struct block_device_operations zram_devops = {
	.swap_slot_free_notify = zram_slot_free_notify,
	.rw_page = zram_rw_page,
	.owner = THIS_MODULE
};

static DEVICE_ATTR_RW(disksize);
static DEVICE_ATTR_RO(initstate);
static DEVICE_ATTR_WO(reset);
static DEVICE_ATTR_RO(orig_data_size);
static DEVICE_ATTR_RO(mem_used_total);
static DEVICE_ATTR_RW(mem_limit);
static DEVICE_ATTR_RW(mem_used_max);
static DEVICE_ATTR_RW(max_comp_streams);
static DEVICE_ATTR_RW(comp_algorithm);

ZRAM_ATTR_RO(num_reads);
ZRAM_ATTR_RO(num_writes);
ZRAM_ATTR_RO(failed_reads);
ZRAM_ATTR_RO(failed_writes);
ZRAM_ATTR_RO(invalid_io);
ZRAM_ATTR_RO(notify_free);
ZRAM_ATTR_RO(zero_pages);
ZRAM_ATTR_RO(compr_data_size);

static struct attribute *zram_disk_attrs[] = {
	&dev_attr_disksize.attr,
	&dev_attr_initstate.attr,
	&dev_attr_reset.attr,
	&dev_attr_num_reads.attr,
	&dev_attr_num_writes.attr,
	&dev_attr_failed_reads.attr,
	&dev_attr_failed_writes.attr,
	&dev_attr_invalid_io.attr,
	&dev_attr_notify_free.attr,
	&dev_attr_zero_pages.attr,
	&dev_attr_orig_data_size.attr,
	&dev_attr_compr_data_size.attr,
	&dev_attr_mem_used_total.attr,
	&dev_attr_mem_limit.attr,
	&dev_attr_mem_used_max.attr,
	&dev_attr_max_comp_streams.attr,
	&dev_attr_comp_algorithm.attr,
	NULL,
};

static struct attribute_group zram_disk_attr_group = {
	.attrs = zram_disk_attrs,
};

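/*
 * Allocate and register the request queue, gendisk and sysfs attribute
 * group for one zram device.
 */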
static int create_device(struct zram *zram, int device_id)
{
	int ret = -ENOMEM;

	init_rwsem(&zram->init_lock);

	zram->queue = blk_alloc_queue(GFP_KERNEL);
	if (!zram->queue) {
		pr_err("Error allocating disk queue for device %d\n",
			device_id);
		goto out;
	}

	blk_queue_make_request(zram->queue, zram_make_request);
	zram->queue->queuedata = zram;

	/* gendisk structure */
	zram->disk = alloc_disk(1);
	if (!zram->disk) {
		pr_warn("Error allocating disk structure for device %d\n",
			device_id);
		goto out_free_queue;
	}

	zram->disk->major = zram_major;
	zram->disk->first_minor = device_id;
	zram->disk->fops = &zram_devops;
	zram->disk->queue = zram->queue;
	zram->disk->private_data = zram;
	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);

	/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize) */
	set_capacity(zram->disk, 0);
	/* zram devices sort of resemble non-rotational disks */
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
	/*
	 * Ensure that we always get PAGE_SIZE-aligned
	 * and n*PAGE_SIZE-sized I/O requests.
	 */
	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
	blk_queue_logical_block_size(zram->disk->queue,
					ZRAM_LOGICAL_BLOCK_SIZE);
	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
	zram->disk->queue->limits.max_discard_sectors = UINT_MAX;
	/*
	 * zram_bio_discard() will clear all logical blocks if logical block
	 * size is identical with physical block size (PAGE_SIZE). But if it is
	 * different, we will skip discarding some parts of logical blocks in
	 * the part of the request range which isn't aligned to physical block
	 * size.  So we can't ensure that all discarded logical blocks are
	 * zeroed.
	 */
	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
		zram->disk->queue->limits.discard_zeroes_data = 1;
	else
		zram->disk->queue->limits.discard_zeroes_data = 0;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);

	add_disk(zram->disk);

	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
				&zram_disk_attr_group);
	if (ret < 0) {
		pr_warn("Error creating sysfs group\n");
		goto out_free_disk;
	}
	strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
	zram->meta = NULL;
	zram->max_comp_streams = 1;
	return 0;

out_free_disk:
	del_gendisk(zram->disk);
	put_disk(zram->disk);
out_free_queue:
	blk_cleanup_queue(zram->queue);
out:
	return ret;
}

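/* Undo create_device(): remove the sysfs group, the gendisk and the queue. */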
static void destroy_device(struct zram *zram)
{
	sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
			&zram_disk_attr_group);

	del_gendisk(zram->disk);
	put_disk(zram->disk);

	blk_cleanup_queue(zram->queue);
}

static int __init zram_init(void)
{
	int ret, dev_id;

	if (num_devices > max_num_devices) {
		pr_warn("Invalid value for num_devices: %u\n",
				num_devices);
		ret = -EINVAL;
		goto out;
	}

	zram_major = register_blkdev(0, "zram");
	if (zram_major <= 0) {
		pr_warn("Unable to get major number\n");
		ret = -EBUSY;
		goto out;
	}

	/* Allocate the device array and initialize each one */
	zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
	if (!zram_devices) {
		ret = -ENOMEM;
		goto unregister;
	}

	for (dev_id = 0; dev_id < num_devices; dev_id++) {
		ret = create_device(&zram_devices[dev_id], dev_id);
		if (ret)
			goto free_devices;
	}

	pr_info("Created %u device(s) ...\n", num_devices);

	return 0;

free_devices:
	while (dev_id)
		destroy_device(&zram_devices[--dev_id]);
	kfree(zram_devices);
unregister:
	unregister_blkdev(zram_major, "zram");
out:
	return ret;
}

static void __exit zram_exit(void)
{
	int i;
	struct zram *zram;

	for (i = 0; i < num_devices; i++) {
		zram = &zram_devices[i];

		destroy_device(zram);
		/*
		 * Shouldn't access zram->disk after destroy_device
		 * because destroy_device already released zram->disk.
		 */
		zram_reset_device(zram, false);
	}

	unregister_blkdev(zram_major, "zram");

	kfree(zram_devices);
	pr_debug("Cleanup done!\n");
}

module_init(zram_init);
module_exit(zram_exit);

module_param(num_devices, uint, 0);
MODULE_PARM_DESC(num_devices, "Number of zram devices");

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
MODULE_DESCRIPTION("Compressed RAM Block Device");