staging: erofs: introduce VLE decompression support

This patch introduces the basic in-place VLE decompression implementation
for the erofs file system.

Compared with fixed-sized input compression, it implements what we call
'the variable-length extent compression', which specifies the same output
size for each compression block in order to make full use of IO bandwidth
(which means almost all data read from the block device can be directly
used for decompression), to improve real random read performance (rather
than just via data caching, which costs more memory), and to keep
relatively lower compression ratios (it saves more storage space than
fixed-sized input compression configured with the same input block size),
as illustrated below:

        |--- variable-length extent ---|------ VLE ------|--- VLE ---|
         /> clusterofs                   /> clusterofs     /> clusterofs /> clusterofs
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
...||   |       ||           ||         | ||           || |         || | ... original data
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
   ++->cluster<-++->cluster<-++->cluster<-++->cluster<-++->cluster<-++
        size         size         size         size         size
      \                         /                 /            /
        \                     /               /            /
          \                 /             /            /
            ++-----------++-----------++-----------++
        ... ||           ||           ||           || ... compressed clusters
            ++-----------++-----------++-----------++
            ++->cluster<-++->cluster<-++->cluster<-++
                 size         size         size

The main point of 'in-place' refers to the decompression mode: instead of
allocating independent compressed pages and data structures, it reuses the
already allocated file cache pages as much as possible to store its
compressed data and the corresponding pagevec in a time-sharing approach
by default, which is useful for low-memory scenarios.

In the end, unlike other filesystems with (de)compression support that use
a relatively large compression block size, read and decompress >= 128KB at
once, and thereby gain better-looking random read numbers (in fact they
collect small random reads into large sequential reads and cache all
decompressed data in memory, which is unacceptable especially for embedded
devices with limited memory, and is not real random read), we select a
universal small-sized 4KB compressed cluster, which is the smallest page
size for most architectures. All compressed clusters can be read and
decompressed independently, which ensures real random read performance for
all use cases.

Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

staging: erofs: introduce VLE decompression support
This patch introduces the basic in-place VLE decompression implementation
for the erofs file system.

Compared with fixed-sized input compression, it implements what we call
'the variable-length extent compression', which specifies the same output
size for each compression block in order to make full use of IO bandwidth
(which means almost all data read from the block device can be directly
used for decompression), to improve real random read performance (rather
than just via data caching, which costs more memory), and to keep
relatively lower compression ratios (it saves more storage space than
fixed-sized input compression configured with the same input block size),
as illustrated below:

        |--- variable-length extent ---|------ VLE ------|--- VLE ---|
         /> clusterofs                   /> clusterofs     /> clusterofs /> clusterofs
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
...||   |       ||           ||         | ||           || |         || | ... original data
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
   ++->cluster<-++->cluster<-++->cluster<-++->cluster<-++->cluster<-++
        size         size         size         size         size
      \                         /                 /            /
        \                     /               /            /
          \                 /             /            /
            ++-----------++-----------++-----------++
        ... ||           ||           ||           || ... compressed clusters
            ++-----------++-----------++-----------++
            ++->cluster<-++->cluster<-++->cluster<-++
                 size         size         size

The main point of 'in-place' refers to the decompression mode: instead of
allocating independent compressed pages and data structures, it reuses the
already allocated file cache pages as much as possible to store its
compressed data and the corresponding pagevec in a time-sharing approach
by default, which is useful for low-memory scenarios.

In the end, unlike other filesystems with (de)compression support that use
a relatively large compression block size, read and decompress >= 128KB at
once, and thereby gain better-looking random read numbers (in fact they
collect small random reads into large sequential reads and cache all
decompressed data in memory, which is unacceptable especially for embedded
devices with limited memory, and is not real random read), we select a
universal small-sized 4KB compressed cluster, which is the smallest page
size for most architectures. All compressed clusters can be read and
decompressed independently, which ensures real random read performance for
all use cases.

Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
3883a79a · Gao Xiang · Greg Kroah-Hartman · e7e9a307 · 3883a79a · 3883a79a
6 changed file
--- a/drivers/staging/erofs/inode.c
+++ b/drivers/staging/erofs/inode.c
@@ -210,7 +210,12 @@ static int fill_inode(struct inode *inode, int isdir)
 		}
 		if (is_inode_layout_compression(inode)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+			inode->i_mapping->a_ops =
+				&z_erofs_vle_normalaccess_aops;
+#else
 			err = -ENOTSUPP;
+#endif
 			goto out_unlock;
 		}

--- a/drivers/staging/erofs/internal.h
+++ b/drivers/staging/erofs/internal.h
@@ -262,6 +262,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
 #ifdef CONFIG_EROFS_FS_ZIP
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
+/* page count of a compressed cluster */
+#define erofs_clusterpages(sbi)         ((1 << (sbi)->clusterbits) / PAGE_SIZE)
 #endif
 typedef u64 erofs_off_t;
@@ -340,6 +343,9 @@ extern const struct inode_operations erofs_dir_iops;
 extern const struct file_operations erofs_dir_fops;
 extern const struct address_space_operations erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ZIP
+extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
+#endif
 /*
 * Logical to physical block mapping, used by erofs_map_blocks()

--- a/drivers/staging/erofs/super.c
+++ b/drivers/staging/erofs/super.c
@@ -115,6 +115,13 @@ static int superblock_read(struct super_block *sb)
 	sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
 #endif
 	sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
+#ifdef CONFIG_EROFS_FS_ZIP
+	sbi->clusterbits = 12;
+	if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
+		errln("clusterbits %u is not supported on this kernel",
+			sbi->clusterbits);
+#endif
 	sbi->root_nid = le16_to_cpu(layout->root_nid);
 	sbi->inos = le64_to_cpu(layout->inos);
@@ -441,6 +448,11 @@ static struct file_system_type erofs_fs_type = {
 };
 MODULE_ALIAS_FS("erofs");
+#ifdef CONFIG_EROFS_FS_ZIP
+extern int z_erofs_init_zip_subsystem(void);
+extern void z_erofs_exit_zip_subsystem(void);
+#endif
 static int __init erofs_module_init(void)
 {
 	int err;
@@ -456,6 +468,12 @@ static int __init erofs_module_init(void)
 	if (err)
 		goto shrinker_err;
+#ifdef CONFIG_EROFS_FS_ZIP
+	err = z_erofs_init_zip_subsystem();
+	if (err)
+		goto zip_err;
+#endif
 	err = register_filesystem(&erofs_fs_type);
 	if (err)
 		goto fs_err;
@@ -464,6 +482,10 @@ static int __init erofs_module_init(void)
 	return 0;
 fs_err:
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+zip_err:
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 shrinker_err:
 	erofs_exit_inode_cache();
@@ -474,6 +496,9 @@ static int __init erofs_module_init(void)
 static void __exit erofs_module_exit(void)
 {
 	unregister_filesystem(&erofs_fs_type);
+#ifdef CONFIG_EROFS_FS_ZIP
+	z_erofs_exit_zip_subsystem();
+#endif
 	unregister_shrinker(&erofs_shrinker_info);
 	erofs_exit_inode_cache();
 	infoln("successfully finalize erofs");

--- a/drivers/staging/erofs/unzip_vle.c
+++ b/drivers/staging/erofs/unzip_vle.c
--- a/drivers/staging/erofs/unzip_vle.h
+++ b/drivers/staging/erofs/unzip_vle.h
@@ -14,9 +14,213 @@
 #define __EROFS_FS_UNZIP_VLE_H
 #include "internal.h"
+#include "unzip_pagevec.h"
+/*
+ *  - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
+ * used for temporary allocated pages (via erofs_allocpage),
+ * in order to separate those from NULL mapping (e.g. truncated pages)
+ */
+#define Z_EROFS_MAPPING_STAGING		((void *)0x5A110C8D)
+#define z_erofs_is_stagingpage(page)	\
+	((page)->mapping == Z_EROFS_MAPPING_STAGING)
+/*
+ * Queue @page onto @page_pool (through page->lru) when it is a staging page.
+ * Returns true if the page was queued, false if it was left untouched.
+ */
+static inline bool z_erofs_gather_if_stagingpage(struct list_head *page_pool,
+						 struct page *page)
+{
+	/* pages whose mapping is not the staging marker are not ours to collect */
+	if (!z_erofs_is_stagingpage(page))
+		return false;
+	list_add(&page->lru, page_pool);
+	return true;
+}
+/*
+ * Structure fields follow one of the following exclusion rules.
+ *
+ * I: Modifiable by initialization/destruction paths and read-only
+ *    for everyone else.
+ *
+ */
 #define Z_EROFS_VLE_INLINE_PAGEVECS     3
+struct z_erofs_vle_work {	/* per-work state: in-page offset, page count and queued pagevec */
+	/* struct z_erofs_vle_work *left, *right; */
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+	struct list_head list;	/* only present with CONFIG_EROFS_FS_ZIP_MULTIREF */
+	atomic_t refcount;	/* only present with CONFIG_EROFS_FS_ZIP_MULTIREF */
+#endif
+	struct mutex lock;	/* NOTE(review): presumably the lock meant by the 'L:' tags below; the exclusion-rule legend only defines 'I:' -- confirm */
+	/* I: decompression offset in page */
+	unsigned short pageofs;
+	unsigned short nr_pages;
+	/* L: queued pages in pagevec[] */
+	unsigned vcnt;
+	union {
+		/* L: pagevec */
+		erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];	/* Z_EROFS_VLE_INLINE_PAGEVECS inline slots */
+		struct rcu_head rcu;	/* shares storage with pagevec[]; presumably for RCU-deferred freeing -- confirm */
+	};
+};
+#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN        0
+#define Z_EROFS_VLE_WORKGRP_FMT_LZ4          1
+#define Z_EROFS_VLE_WORKGRP_FMT_MASK         1
+typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
+struct z_erofs_vle_workgroup {	/* a generic erofs_workgroup plus its embedded primary work */
+	struct erofs_workgroup obj;
+	struct z_erofs_vle_work work;	/* the work returned by z_erofs_vle_grab_primary_work() */
+	/* next owned workgroup */
+	z_erofs_vle_owned_workgrp_t next;
+	/* compressed pages (including multi-usage pages) */
+	struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+	unsigned int llen, flags;	/* flags: format lives in the Z_EROFS_VLE_WORKGRP_FMT_MASK bits; llen: presumably the logical length -- confirm */
+};
+/* let's avoid the valid 32-bit kernel addresses */
+/* the chained workgroup hasn't submitted io (still open) */
+#define Z_EROFS_VLE_WORKGRP_TAIL        ((void *)0x5F0ECAFE)
+/* the chained workgroup has already submitted io */
+#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
+#define Z_EROFS_VLE_WORKGRP_NIL         (NULL)
+#define z_erofs_vle_workgrp_fmt(grp)	\
+	((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
+/*
+ * Store @fmt into @grp->flags: the bits covered by
+ * Z_EROFS_VLE_WORKGRP_FMT_MASK are cleared first and @fmt is then OR'ed in
+ * as-is (it is not masked), while every other flag bit is preserved.
+ */
+static inline void z_erofs_vle_set_workgrp_fmt(
+	struct z_erofs_vle_workgroup *grp,
+	unsigned int fmt)
+{
+	const unsigned int kept = grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK;
+	grp->flags = fmt | kept;
+}
+#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
+#error multiref decompression is unimplemented yet
+#else
+#define z_erofs_vle_grab_primary_work(grp)	(&(grp)->work)
+#define z_erofs_vle_grab_work(grp, pageofs)	(&(grp)->work)
+#define z_erofs_vle_work_workgroup(wrk, primary)	\
+	((primary) ? container_of(wrk,	\
+		struct z_erofs_vle_workgroup, work) : \
+		({ BUG(); (void *)NULL; }))
+#endif
+#define Z_EROFS_WORKGROUP_SIZE       sizeof(struct z_erofs_vle_workgroup)
+struct z_erofs_vle_unzip_io {	/* NOTE(review): appears to track one batch of submitted decompression io -- confirm against unzip_vle.c */
+	atomic_t pending_bios;	/* presumably the number of bios still in flight -- confirm */
+	z_erofs_vle_owned_workgrp_t head;	/* a workgroup pointer; presumably the head of the owned chain linked through ->next -- confirm */
+	union {
+		wait_queue_head_t wait;	/* the two members share storage, so only one is in use at a time */
+		struct work_struct work;
+	} u;
+};
+struct z_erofs_vle_unzip_io_sb {	/* an unzip io descriptor bundled with a super_block pointer */
+	struct z_erofs_vle_unzip_io io;
+	struct super_block *sb;
+};
+#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
+#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
+#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT  (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+/*
+ * waiters (aka. ongoing_packs): # to unlock the page
+ * sub-index: 0 - for partial page, >= 1 full page sub-index
+ */
+typedef atomic_t z_erofs_onlinepage_t;
+/* type punning */
+union z_erofs_onlinepage_converter {	/* lets one address be used as either pointer type */
+	z_erofs_onlinepage_t *o;	/* atomic_t view, used with atomic_read()/atomic_dec_return() */
+	unsigned long *v;	/* assigned from &page_private(page) by the onlinepage helpers */
+};
+/*
+ * Return the sub-index kept in the bits above the waiter count of @page's
+ * private word.  BUGs if PG_private is not set on @page.
+ */
+static inline unsigned z_erofs_onlinepage_index(struct page *page)
+{
+	union z_erofs_onlinepage_converter conv;
+	int val;
+	BUG_ON(!PagePrivate(page));
+	conv.v = &page_private(page);
+	val = atomic_read(conv.o);
+	return val >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+}
+/*
+ * Turn @page into an "online page": its private word is reset to a waiter
+ * count of 1 with sub-index 0, and PG_private is then set.
+ *
+ * The whole unsigned long is cleared before the atomic_t view is written.
+ * atomic_t is narrower than unsigned long on 64-bit, so initialising only
+ * the atomic_t member of a local union (as was done before) leaves the
+ * remaining bytes of the stored word unspecified.
+ */
+static inline void z_erofs_onlinepage_init(struct page *page)
+{
+	union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
+	set_page_private(page, 0);
+	/* keep from being unlocked in advance */
+	atomic_set(u.o, 1);
+	/* order the private-word stores before the PG_private store */
+	smp_wmb();
+	SetPagePrivate(page);
+}
+/*
+ * Record sub-index @index in @page's private word and, when @down is true,
+ * add one to the waiter count held in the low
+ * Z_EROFS_ONLINEPAGE_COUNT_BITS bits.
+ *
+ * If a non-zero sub-index is already recorded and @index is 0, nothing is
+ * changed (the count is not touched either); a recorded sub-index that
+ * differs from a non-zero @index is a BUG.
+ *
+ * The word is accessed through the atomic_t view only, exactly like
+ * z_erofs_onlinepage_index() and z_erofs_onlinepage_endio().  Operating on
+ * the full unsigned long instead would mix in bytes the atomic_t accessors
+ * never write (and the wrong half on 64-bit big-endian).
+ */
+static inline void z_erofs_onlinepage_fixup(struct page *page,
+	uintptr_t index, bool down)
+{
+	union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
+	int orig, orig_index, val;
+repeat:
+	orig = atomic_read(u.o);
+	orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
+	if (orig_index) {
+		if (!index)
+			return;
+		BUG_ON(orig_index != index);
+	}
+	val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
+		((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned)down);
+	/* somebody else changed the word in between: recompute and retry */
+	if (atomic_cmpxchg(u.o, orig, val) != orig)
+		goto repeat;
+}
+/*
+ * Drop one waiter from @page.  When the waiter count reaches zero,
+ * PG_private is cleared, the page is marked uptodate unless PG_error is
+ * set, and the page is unlocked.  BUGs if PG_private is not set on entry.
+ */
+static inline void z_erofs_onlinepage_endio(struct page *page)
+{
+	union z_erofs_onlinepage_converter conv;
+	unsigned waiters;
+	BUG_ON(!PagePrivate(page));
+	conv.v = &page_private(page);
+	waiters = atomic_dec_return(conv.o) & Z_EROFS_ONLINEPAGE_COUNT_MASK;
+	if (waiters == 0) {
+		ClearPagePrivate(page);
+		if (!PageError(page))
+			SetPageUptodate(page);
+		unlock_page(page);
+	}
+	debugln("%s, page %p value %x", __func__, page, atomic_read(conv.o));
+}
+#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES	\
+	min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
+#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES	2048
 /* unzip_vle_lz4.c */
 extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
 	unsigned clusterpages, struct page **pages,

--- a/drivers/staging/erofs/utils.c
+++ b/drivers/staging/erofs/utils.c
@@ -12,6 +12,7 @@
 */
 #include "internal.h"
+#include <linux/pagevec.h>
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
 {
@@ -98,11 +99,69 @@ int erofs_register_workgroup(struct super_block *sb,
 	return err;
 }
+extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
+/*
+ * Drop one reference on @grp and return the post-decrement count.
+ *
+ * When the count falls to 1, erofs_global_shrink_cnt is incremented; when
+ * it falls to 0, the counter is decremented again and the group is handed
+ * to erofs_workgroup_free_rcu().
+ */
+int erofs_workgroup_put(struct erofs_workgroup *grp)
+{
+	const int remaining = atomic_dec_return(&grp->refcount);
+	switch (remaining) {
+	case 1:
+		atomic_long_inc(&erofs_global_shrink_cnt);
+		break;
+	case 0:
+		atomic_long_dec(&erofs_global_shrink_cnt);
+		erofs_workgroup_free_rcu(grp);
+		break;
+	default:
+		break;
+	}
+	return remaining;
+}
 unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,	/* walk sbi->workstn_tree, drop up to nr_shrink workgroups, return how many were dropped */
 				       unsigned long nr_shrink,
 				       bool cleanup)
 {
-	return 0;
+	pgoff_t first_index = 0;	/* index at which the next gang lookup starts */
+	void *batch[PAGEVEC_SIZE];
+	unsigned freed = 0;
+	int i, found;
+repeat:
+	erofs_workstn_lock(sbi);
+	found = radix_tree_gang_lookup(&sbi->workstn_tree,
+		batch, first_index, PAGEVEC_SIZE);	/* fetch at most PAGEVEC_SIZE entries per pass */
+	for (i = 0; i < found; ++i) {
+		int cnt;
+		struct erofs_workgroup *grp = (void *)
+			((unsigned long)batch[i] &
+				~RADIX_TREE_EXCEPTIONAL_ENTRY);	/* strip the exceptional-entry bit to recover the pointer */
+		first_index = grp->index + 1;	/* the next pass resumes right after this group */
+		cnt = atomic_read(&grp->refcount);
+		BUG_ON(cnt <= 0);
+		if (cleanup)
+			BUG_ON(cnt != 1);	/* in cleanup mode every group must have exactly one reference left */
+		else if (cnt > 1)
+			continue;	/* otherwise groups with more than one reference are skipped */
+		if (radix_tree_delete(&sbi->workstn_tree,
+			grp->index) != grp)	/* NOTE(review): compared against the untagged pointer; if entries are stored tagged, this skips the put after the entry is already deleted -- confirm */
+			continue;
+		/* (rarely) grabbed again when freeing */
+		erofs_workgroup_put(grp);
+		++freed;
+		if (unlikely(!--nr_shrink))
+			break;	/* shrink budget exhausted */
+	}
+	erofs_workstn_unlock(sbi);
+	if (i && nr_shrink)	/* another pass only if this one visited entries and budget remains */
+		goto repeat;
+	return freed;
 }
 #endif