diff --git a/fs/eulerfs/super.c b/fs/eulerfs/super.c new file mode 100644 index 0000000000000000000000000000000000000000..43fc717002d747639320dce4c7fa17c4b0798a58 --- /dev/null +++ b/fs/eulerfs/super.c @@ -0,0 +1,811 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "euler.h" +#include "dht.h" +#include "dep.h" +#include "nvalloc.h" +#include "wear.h" + +int support_clwb; +int support_clflushopt; +int support_clflush; +int force_nocache_write; +int persist_period = -4; +int persisters_per_socket = 1; +int max_dirty_inodes = 1000000; +int max_dep_nodes = 1000000; +int wear_control; +int wear_threshold = 100000; +int wear_alloc_threshold = 10000; + +module_param(persisters_per_socket, int, 0444); +MODULE_PARM_DESC(persisters_per_socket, "Num of Persisters per socket"); +module_param(force_nocache_write, int, 0444); +MODULE_PARM_DESC(force_nocache_write, "Force to use nocache data write"); +module_param(persist_period, int, 0444); +MODULE_PARM_DESC(persist_period, "Period to wake persisters up"); +module_param(max_dirty_inodes, int, 0444); +MODULE_PARM_DESC(max_dirty_inodes, + "Limit the max number of dirty inodes allowed"); +module_param(max_dep_nodes, int, 0444); +MODULE_PARM_DESC(max_dep_nodes, "Limit the max number of dep nodes allowed"); +module_param(wear_control, int, 0444); +MODULE_PARM_DESC(wear_control, "Control wear leveling"); +module_param(wear_threshold, int, 0444); +MODULE_PARM_DESC(wear_threshold, "Wear leveling threshold"); +module_param(wear_alloc_threshold, int, 0444); +MODULE_PARM_DESC(wear_alloc_threshold, + "Wear leveling threshold for allocation"); + +int num_sockets; + +static struct super_operations eufs_sops; + +void eufs_error_mng(struct super_block *sb, const char *fmt, ...) +{ + va_list args; + + eufs_info("euler error: "); + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + pr_crit("euler err: remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; +} + +static void eufs_show_params(void) +{ + eufs_info("params: force_nocache_write=%d\n", force_nocache_write); + eufs_info("params: persist_period=%d\n", persist_period); + eufs_info("params: persisters_per_socket=%d\n", persisters_per_socket); +} + +static void eufs_detect_features(void) +{ + support_clwb = support_clflushopt = support_clflush = 0; + if (arch_has_clwb()) { + eufs_info("arch has CLWB support\n"); + support_clwb = 1; + } + + if (arch_has_clflushopt()) { + eufs_info("arch has CLFLUSHOPT support\n"); + support_clflushopt = 1; + } + + if (arch_has_clflush()) { + eufs_info("arch has CLFLUSH support\n"); + support_clflush = 1; + } + + if (!support_clwb && !support_clflushopt && !support_clflush) + eufs_info("arch has no cache flush support\n"); +} + +static int eufs_get_block_info(struct super_block *sb, struct eufs_sb_info *sbi) +{ + void *virt_addr = NULL; + pfn_t pfn; + long size; + struct dax_device *dax_dev; + int srcu_id; + + if (!bdev_dax_supported(sb->s_bdev, PAGE_SIZE)) { + eufs_err(sb, "device does not support DAX\n"); + return -EINVAL; + } + + dax_dev = dax_get_by_host(sb->s_bdev->bd_disk->disk_name); + if (!dax_dev) { + eufs_err(sb, "device does not support DAX\n"); + return -EINVAL; + } + + srcu_id = dax_read_lock(); + size = dax_direct_access( + dax_dev, 0, i_size_read(sb->s_bdev->bd_inode) >> PAGE_SHIFT, + &virt_addr, &pfn); + dax_read_unlock(srcu_id); + if (size < 0) { + fs_put_dax(dax_dev); + eufs_err(sb, "device DAX error %ld\n", size); + return size; + } + + sbi->s_dax_dev = dax_dev; + sbi->s_bdev = sb->s_bdev; + sbi->virt_addr = virt_addr; + sbi->phys_addr = pfn_t_to_pfn(pfn) << PAGE_SHIFT; + sbi->initsize = (u64)size << PAGE_SHIFT; + + eufs_info("dev %s virt_addr %px phys_addr %llx size %ld\n", + sb->s_bdev->bd_disk->disk_name, sbi->virt_addr, + sbi->phys_addr, sbi->initsize); + + return 0; +} + +enum { + Opt_init, + Opt_dax, + Opt_err +}; + +static const match_table_t tokens = { + { Opt_init, "init" }, + { Opt_dax, "dax" }, /* DAX is always on. This is for compatibility. */ + { Opt_err, NULL }, +}; + +static int eufs_parse_options(char *options, struct eufs_sb_info *sbi, + bool remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_init: + if (remount) + goto bad_opt; + set_opt(sbi->s_mount_opt, FORMAT); + break; + case Opt_dax: + break; + default: + goto bad_opt; + } + } + + return 0; + +bad_opt: + eufs_info("Bad mount option: \"%s\"\n", p); + return -EINVAL; +} + +static bool eufs_check_size(struct super_block *sb, unsigned long size) +{ + unsigned long minimum_size; + + /* For Super Block */ + minimum_size = 2 << sb->s_blocksize_bits; + /* For Bitmaps */ + minimum_size += size / EUFS_BLOCK_SIZE / 8; + + if (size < minimum_size) + return false; + + return true; +} + +static __always_inline int eufs_check_super(struct eufs_super_block *ps, + const char *typ) +{ + u16 save_crc = 0; + u16 calc_crc = 0; + struct eufs_super_block scratch; + + memcpy(&scratch, ps, sizeof(*ps)); + save_crc = scratch.s_sum; + scratch.s_sum = 0; + scratch.s_safe_umount = 0; + calc_crc = crc16(~0, (__u8 *)&scratch, sizeof(scratch)); + if (save_crc != calc_crc) { + eufs_warn("Recognizing %s super block failed: crc %x mismatch (%x expected)", + typ, calc_crc, save_crc); + return -EIO; + } + if (scratch.s_magic != EUFS_SUPER_MAGIC) { + eufs_warn("Recognizing %s super block failed: magic %x mismatch (%x expected)", + typ, scratch.s_magic, EUFS_SUPER_MAGIC); + return -EIO; + } + return 0; +} + +static __always_inline int eufs_recognize_fs(struct super_block *sb) +{ + struct eufs_super_block *super; + struct eufs_super_block *super2; + int err; + + super = eufs_get_super(sb); + super2 = (void *)super + EUFS_SB2_OFFSET; + err = eufs_check_super(super, "primary"); + if (err) { + err = eufs_check_super(super2, "secondary"); + if (err) + return -EIO; + + eufs_info("Secondary super block recognized, syncing back to the primary.\n"); + memcpy(super, super2, sizeof(struct eufs_super_block)); + eufs_flush_buffer(super2, sizeof(*super2), false); + eufs_pbarrier(); + } + return 0; +} + +static __always_inline void eufs_sync_super(struct eufs_super_block *ps) +{ + u16 crc = 0; + __le32 saved_safe_umount = ps->s_safe_umount; + + ps->s_safe_umount = 0; + ps->s_wtime = cpu_to_le32(get_seconds()); + ps->s_sum = 0; + crc = crc16(~0, (__u8 *)ps, sizeof(struct eufs_super_block)); + ps->s_sum = cpu_to_le16(crc); + + eufs_flush_buffer(ps, sizeof(*ps), false); + eufs_pbarrier(); + + /* Keep sync redundant super block */ + memcpy((void *)ps + EUFS_SB2_OFFSET, (void *)ps, + sizeof(struct eufs_super_block)); + eufs_flush_buffer((void *)ps + EUFS_SB2_OFFSET, sizeof(*ps), false); + eufs_pbarrier(); + ps->s_safe_umount = saved_safe_umount; +} + +static struct eufs_inode *eufs_init(struct super_block *sb, unsigned long size) +{ + struct eufs_inode __pmem *root_i; + struct eufs_super_block __pmem *super; + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct nv_dict *dict; + + eufs_info("creating an empty eulerfs of size %lu\n", size); + + sbi->block_start = 0; + sbi->block_end = ((unsigned long)(size) >> PAGE_SHIFT); + + if (!sbi->virt_addr) { + eufs_err(sb, "mapping eulerfs image failed\n"); + return ERR_PTR(-EINVAL); + } + + sb->s_blocksize_bits = EUFS_BLOCK_SIZE_BITS; + sbi->blocksize = EUFS_BLOCK_SIZE; + + if (!eufs_check_size(sb, size)) { + eufs_err(sb, "Specified size too small 0x%lx for EulerFS\n", + size); + return ERR_PTR(-EINVAL); + } + + super = eufs_get_super(sb); + + super->s_sum = 0; + super->s_magic = cpu_to_le16(EUFS_SUPER_MAGIC); + super->s_safe_umount = 0; + super->s_flag = 0; + super->s_fs_version = cpu_to_le16(1); + super->s_size = cpu_to_le64(size); + super->s_virt_addr = cpu_to_le64(sbi->virt_addr); + + sbi->s_crash_ver = 1; + super->s_crash_ver = cpu_to_le64(1); + + nv_init(sb, true); + super->s_page_map = cpu_to_le64(p2o(sb, sbi->page_map)); + super->s_mtime = 0; + + root_i = eufs_malloc_pinode(sb); + if (!root_i) + return ERR_PTR(-ENOSPC); + + eufs_info("root_i: %px\n", root_i); + eufs_alloc_persist(sb, root_i, false); + + super->s_root_pi = p2s(sb, root_i); + eufs_sync_super(super); + + /* ================ init root dir =============== */ + eufs_iwrite_flags(root_i, 0); + eufs_iwrite_mode(root_i, S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR); + eufs_iwrite_version(root_i, 1); + eufs_iwrite_ctime(root_i, get_seconds()); + eufs_iwrite_ctime_nsec(root_i, 0); + eufs_iwrite_uid(root_i, from_kuid(&init_user_ns, current_fsuid())); + eufs_iwrite_gid(root_i, from_kgid(&init_user_ns, current_fsgid())); + eufs_iwrite_dotdot(root_i, p2o(sb, root_i)); + eufs_iwrite_ext(root_i, 0); /* no ext here */ + eufs_iwrite_generation(root_i, 0); + eufs_iwrite_nlink(root_i, 2); + eufs_iwrite_mtime(root_i, get_seconds()); + eufs_iwrite_atime(root_i, get_seconds()); + eufs_iwrite_mtime_nsec(root_i, 0); + eufs_iwrite_atime_nsec(root_i, 0); + dict = eufs_zalloc_htable(sb); + if (!dict) + return ERR_PTR(-ENOSPC); + eufs_alloc_persist(sb, dict, false); + eufs_flush_range(dict, sizeof(struct nv_dict)); + + eufs_iwrite_dict(root_i, p2o(sb, dict)); + eufs_iwrite_size(root_i, 0); + + root_i->i_fresh = 2; + eufs_flush_cacheline(root_i); + eufs_flush_cacheline(&root_i->i_fresh); + EUFS_TWIN_PI(root_i)->i_fresh = 1; + eufs_flush_cacheline(&EUFS_TWIN_PI(root_i)->i_fresh); + + eufs_pbarrier(); + return root_i; +} + +static void eufs_destroy_super(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + wear_fini(sb); + + dep_fini(sb); + + nv_fini(sb); + + if (sbi->virt_addr) + sbi->virt_addr = NULL; + if (sbi->s_dax_dev) + fs_put_dax(sbi->s_dax_dev); + + sb->s_fs_info = NULL; + + kfree(sbi); +} + +static int eufs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct eufs_super_block __pmem *super; + struct eufs_inode __pmem *root_pi; + struct eufs_sb_info *sbi = NULL; + struct inode *root_i = NULL; + u32 random = 0; + int err; + + BUILD_BUG_ON(sizeof(struct eufs_super_block) > EUFS_SB_SIZE); + BUILD_BUG_ON(sizeof(struct eufs_inode) != 2 * CACHELINE_SIZE); + BUILD_BUG_ON(sizeof(struct nv_dict_entry) != CACHELINE_SIZE); + + eufs_detect_features(); + + sbi = kzalloc(sizeof(struct eufs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sbi->s_draining = false; + init_waitqueue_head(&sbi->s_draining_wq); + atomic_set(&sbi->s_nr_dirty_inodes, 0); + atomic_set(&sbi->s_nr_dep_nodes, 0); + + sb->s_fs_info = sbi; + + err = eufs_get_block_info(sb, sbi); + if (err) + goto out; + + get_random_bytes(&random, sizeof(u32)); + atomic_set(&sbi->next_generation, random); + + mutex_init(&sbi->s_lock); + mutex_init(&sbi->gather_mutex); + mutex_init(&sbi->sync_mutex); + + err = eufs_parse_options(data, sbi, 0); + if (err) + goto out; + + super = eufs_get_super(sb); + + /* Init a new EulerFS instance */ + if (test_opt(sb, FORMAT)) { + root_pi = eufs_init(sb, sbi->initsize); + if (IS_ERR(root_pi)) { + err = PTR_ERR(root_pi); + goto out; + } + + goto setup_sb; + } + + err = eufs_recognize_fs(sb); + if (err) { + eufs_crit("No valid EulerFS found. Are you trying to mount a wrong fs?\n"); + goto out; + } + + sbi->block_start = 0; + sbi->block_end = ((unsigned long)(super->s_size) >> PAGE_SHIFT); + sb->s_blocksize_bits = EUFS_BLOCK_SIZE_BITS; + sbi->blocksize = EUFS_BLOCK_SIZE; + + sbi->page_map = (void *)o2p(sb, super->s_page_map); + sbi->initsize = (u64)super->s_size; + eufs_get_layout(sb, false); + + sbi->s_crash_ver = le64_to_cpu(super->s_crash_ver); + + if (!super->s_safe_umount) { + super->s_crash_ver = cpu_to_le64(++sbi->s_crash_ver); + eufs_flush_cacheline(&super->s_crash_ver); + eufs_pbarrier(); + } + + nv_init(sb, false); + + root_pi = (struct eufs_inode *)s2p(sb, super->s_root_pi); + +setup_sb: + super->s_safe_umount = 0; + eufs_flush_cacheline(&super->s_safe_umount); + eufs_pbarrier(); + + sbi->s_crash_ver = le64_to_cpu(super->s_crash_ver); + + sb->s_magic = le16_to_cpu(super->s_magic); + sb->s_op = &eufs_sops; + sb->s_maxbytes = EUFS_MAX_FILE_SIZE; + sb->s_time_gran = NSEC_PER_SEC; + + err = dep_init(sb); + if (err) + goto out; + + wear_init(sb); + + root_i = eufs_iget(sb, root_pi); + if (IS_ERR(root_i)) { + err = PTR_ERR(root_i); + goto out; + } + + sb->s_root = d_make_root(root_i); + if (!sb->s_root) { + eufs_err(sb, "alloc root dentry failed\n"); + err = -ENOMEM; + goto out; + } + + if (!(sb->s_flags & MS_RDONLY)) { + u64 mnt_write_time; + /* update mount time and write time atomically. */ + mnt_write_time = (get_seconds() & 0xFFFFFFFF); + mnt_write_time = mnt_write_time | (mnt_write_time << 32); + + super->s_mtime = mnt_write_time; + + eufs_flush_buffer(&super->s_mtime, 8, false); + eufs_pbarrier(); + } + + return 0; + +out: + eufs_destroy_super(sb); + return err; +} + +static int eufs_statfs(struct dentry *d, struct kstatfs *buf) +{ + struct super_block *sb = d->d_sb; + struct eufs_sb_info *sbi = (struct eufs_sb_info *)sb->s_fs_info; + + u64 npage, ncl; + + nv_stat(sbi, &npage, &ncl); + + buf->f_type = EUFS_SUPER_MAGIC; + buf->f_bsize = PAGE_SIZE; + + buf->f_blocks = sbi->block_end; + + buf->f_bfree = npage; + buf->f_bavail = npage; + + buf->f_files = ncl; + buf->f_ffree = ncl; + + buf->f_namelen = EUFS_MAX_NAME_LEN; + print_stats(sbi); + + return 0; +} + +static int eufs_show_options(struct seq_file *seq, struct dentry *root) +{ + seq_puts(seq, ",dax"); + + return 0; +} + +static int eufs_remount(struct super_block *sb, int *mntflags, char *data) +{ + unsigned long old_sb_flags; + unsigned long old_mount_opt; + struct eufs_super_block *ps; + struct eufs_sb_info *sbi = EUFS_SB(sb); + int ret = -EINVAL; + + /* Store the old options */ + mutex_lock(&sbi->s_lock); + old_sb_flags = sb->s_flags; + old_mount_opt = sbi->s_mount_opt; + + if (eufs_parse_options(data, sbi, 1)) + goto restore_opt; + + if ((*mntflags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + u64 mnt_write_time; + + ps = eufs_get_super(sb); + /* update mount time and write time atomically. */ + mnt_write_time = (get_seconds() & 0xFFFFFFFF); + mnt_write_time = mnt_write_time | (mnt_write_time << 32); + + ps->s_mtime = mnt_write_time; + + eufs_flush_buffer(&ps->s_mtime, 8, false); + eufs_pbarrier(); + } + + mutex_unlock(&sbi->s_lock); + ret = 0; + return ret; + +restore_opt: + sb->s_flags = old_sb_flags; + sbi->s_mount_opt = old_mount_opt; + mutex_unlock(&sbi->s_lock); + return ret; +} + +static void eufs_put_super(struct super_block *sb) +{ + struct eufs_super_block *super; + + super = eufs_get_super(sb); + + eufs_sync_super(super); + + super->s_safe_umount = 1; + eufs_flush_cacheline(&super->s_safe_umount); + eufs_pbarrier(); + + eufs_info("safe unmount.\n"); + eufs_destroy_super(sb); +} + +static struct inode *eufs_alloc_inode(struct super_block *sb) +{ + struct eufs_inode_info *vi; + + vi = eufs_alloc_vi(); + if (!vi) + return NULL; + + INIT_LIST_HEAD(&vi->i_dep_list); + + vi->i_next_dep_seq = 1; + vi->i_persisted_dep_seq = 0; + + spin_lock_init(&vi->i_owner_lock); + INIT_LIST_HEAD(&vi->i_owner_list); + + vi->i_lock_transferred = I_TRANS_NONE; + vi->i_is_persisting = false; + vi->i_is_dirty = false; + + vi->i_volatile_root = NULL; + vi->i_volatile_height = 0; + + vi->i_dotdot = 0; + + atomic64_set(&vi->vfs_inode.i_version, 1); + + vi->page_batch.size = 0; + vi->page_batch.n_used = -1; + vi->page_batch.batch = NULL; + INIT_LIST_HEAD(&vi->page_batch.list); + + vi->i_volatile_dict = NULL; + + mutex_init(&vi->i_urgent_mutex); + mutex_init(&vi->i_dep_lock); + mutex_init(&vi->i_header_lock); + + init_rwsem(&vi->mmap_rwsem); + spin_lock_init(&vi->i_dentry_persist_lock); + mutex_init(&vi->i_leaf_lock); + + return &vi->vfs_inode; +} + +static void eufs_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + + eufs_alloc_batch_fini(&EUFS_I(inode)->page_batch); + eufs_free_vi(EUFS_I(inode)); +} + +static void eufs_destroy_inode(struct inode *inode) +{ + if (EUFS_I(inode)->i_volatile_dict) { + eufs_free_page(EUFS_I(inode)->i_volatile_dict); + EUFS_I(inode)->i_volatile_dict = NULL; + } + call_rcu(&inode->i_rcu, eufs_i_callback); +} + +static int eufs_sync_fs(struct super_block *sb, int sync) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + int i; + int num_persisters = num_sockets * persisters_per_socket; + int wait_flag; + + if (!sync) + return 0; + + mutex_lock(&sbi->sync_mutex); + + for (i = 0; i < num_persisters; i++) + sbi->need_sync[i] = true; + + /* FIXME: Persisters may miss the wake-up message. */ + for (i = 0; i < num_persisters; ++i) + wake_up_process(sbi->persisters[i]); + + do { + wait_flag = false; + for (i = 0; i < num_persisters; i++) { + if (sbi->need_sync[i] == false) + continue; + wait_flag = true; + wait_event_interruptible(sbi->sync_wq, + (sbi->need_sync[i] == false)); + } + } while (wait_flag); + + mutex_unlock(&sbi->sync_mutex); + + return 0; +} + +/* + * the super block writes are all done "on the fly", so the + * super block is never in a "dirty" state, so there's no need + * for write_super. + */ +static struct super_operations eufs_sops = { + .alloc_inode = eufs_alloc_inode, + .destroy_inode = eufs_destroy_inode, + .write_inode = eufs_write_inode, + .evict_inode = eufs_evict_inode, + .put_super = eufs_put_super, + .statfs = eufs_statfs, + .remount_fs = eufs_remount, + .show_options = eufs_show_options, + .sync_fs = eufs_sync_fs, +}; + +static struct dentry *eufs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, eufs_fill_super); +} + +static struct file_system_type eufs_fs_type = { + .owner = THIS_MODULE, + .name = "eulerfs", + .mount = eufs_mount, + .kill_sb = kill_block_super, +}; + +static int __init init_eufs_fs(void) +{ + int rc = 0; + int cpu; + + BUILD_BUG_ON(sizeof(struct eufs_renamej) != 2 * CACHELINE_SIZE); + + rc = init_page_cache(); + if (rc) + goto out1; + + rc = init_inodecache(); + if (rc) + goto out2; + + rc = init_dep_node_cache(); + if (rc) + goto out3; + + rc = register_filesystem(&eufs_fs_type); + if (rc) + goto out4; + + num_sockets = 0; + for_each_possible_cpu(cpu) { + int sock = cpu_to_node(cpu); + + if (sock > num_sockets) + num_sockets = sock; + } + num_sockets += 1; + eufs_info("Num socket: %d\n", num_sockets); + + eufs_show_params(); + + return 0; + +out4: + destroy_dep_node_cache(); +out3: + destroy_inodecache(); +out2: + destroy_page_cache(); +out1: + return rc; +} + +static void __exit exit_eufs_fs(void) +{ + unregister_filesystem(&eufs_fs_type); + destroy_inodecache(); + destroy_dep_node_cache(); + destroy_page_cache(); +} + +module_init(init_eufs_fs); +module_exit(exit_eufs_fs); + +MODULE_DESCRIPTION("EulerFS"); +MODULE_LICENSE("GPL");