// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/kthread.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/wait.h>
#include "euler.h"
#include "dep.h"
#include "lock.h"
#include "dax.h"
#include "dht.h"

static void do_dep_diradd_oneshot(struct inode *dir_inode,
                                  struct dep_node *dep, u64 *bitset);

struct flush_list_head {
        int count;
        struct llist_head head;
};

DEFINE_PER_CPU(struct flush_list_head, flush_list_percpu);

#define IFMT_HAS_ROOT(ifmt) \
        ((ifmt) == S_IFREG || (ifmt) == S_IFDIR || (ifmt) == S_IFLNK)

#define INODE_COND_TRYLOCK(inode, tag, enter_cond, exit_cond, exit_expr) \
        do { \
        tag: \
                if (enter_cond) { \
                        if (likely(inode_trylock(inode))) { \
                                /* get the lock, okay */ \
                        } else { \
                                if (exit_cond) { \
                                        exit_expr; \
                                } else { \
                                        cond_resched(); \
                                        goto tag; \
                                } \
                        } \
                } \
        } while (0)

static inline void fsync_dir_oneshot(struct inode *dir)
{
        eufs_dir_fsync_oneshot(dir);
}

static void do_dep_dirrem(struct inode *inode, struct dep_node *dep,
                          u64 *bitset)
{
        struct nv_dict_entry *prevde = dep->prevde;
        struct nv_dict_entry *de = dep->de;
        int idx;

        eufs_dbg("!! %s !!", __func__);

        NV_ASSERT(de);
        NV_ASSERT(de->inode);
        NV_ASSERT(de->name);

        idx = INDEX(de->hv);
        bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63));

        eufs_dbg("bitset-add: dict=%llx, %d %llx\n",
                 eufs_iread_dict(EUFS_PI(inode)), idx, bitset[idx / 64]);

        /*
         * This is the removal of a newly created dentry: nothing to do,
         * since prevde has already been adjusted in dht.c.
         */
        if (de->volatile_next == EUFS_DIR_DELNEW)
                return;

        /*
         * If the dentries immediately following the deleted dentry are also
         * deleted, prevde->volatile_next will be modified again. So if we
         * assigned prevde->volatile_next to prevde->next here, those later
         * deletions would be persisted prematurely.
         */
        if (prevde && !eufs_dentry_is_not_persist(prevde)) {
                prevde->next = de->next;
                persist_dentry(prevde);
        }
}
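/*
 * Free the persistent dentry and its name once the corresponding DEP_DIRREM
 * dep has been persisted. The reference on the child inode itself is dropped
 * by the caller via iput().
 */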
static void do_dep_dirrem_reclaim(struct super_block *sb, struct dep_node *dep)
{
        struct nv_dict_entry *de = dep->de;
        struct eufs_inode __maybe_unused *pi;
        struct inode *child;

        pi = s2p(sb, de->inode);
        child = dep->inode;
        NV_ASSERT(EUFS_PI(child) == pi);

        eufs_dbg("dirrem: child_inode=%px\n", child);
        BUG_ON(!child);

        eufs_free_name(sb, de);
        nv_free(sb, de);
}

#define EUFS_PRINT_BITSET(lvl, bitset) \
        eufs_##lvl("bitsets: %llx %llx %llx %llx %llx %llx %llx %llx\n", \
                   bitset[0], bitset[1], bitset[2], bitset[3], bitset[4], \
                   bitset[5], bitset[6], bitset[7])

static void eufs_sync_buckets(struct eufs_inode_info *vi, u64 bitset[8])
{
        struct inode *inode = &vi->vfs_inode;
        struct super_block *sb = inode->i_sb;
        struct eufs_inode *pi = EUFS_FRESH_PI(EUFS_PI(inode));
        struct nv_dict *dict;
        int i;

        /* Volatile buckets */
        if (!vi->i_volatile_dict)
                return;

        EUFS_PRINT_BITSET(dbg, bitset);

        BUG_ON(!inode_is_header_locked(inode));
        dict = o2p(sb, eufs_iread_dict(pi));
        for (i = 0; i < 8; ++i) {
                int j;
                bool dirty;
                int idx;

                if (!bitset[i])
                        continue;
                dirty = false;
                /* Iterate one slot past the end so the last dirty cacheline is flushed. */
                for (j = 0; j <= 64; ++j) {
                        if (j % 8 == 0 && dirty) {
                                dirty = false;
                                eufs_flush_cacheline(&dict->table[idx]);
                        }
                        if (j == 64)
                                break;
                        if (!(bitset[i] & (0x1ull << j)))
                                continue;
                        idx = i * 64 + j;

                        eufs_dbg_dir("handle index %d (i %d, j %d) of inode=%px\n",
                                     idx, i, j, inode);
                        eufs_dbg_dir(" idx=%d dict[idx]=%px vdict[idx]=%px\n",
                                     idx, dict->table[idx],
                                     vi->i_volatile_dict->table[idx]);

                        if (unlikely(vi->i_volatile_dict->table[idx] ==
                                     EUFS_DIR_EOC_PTR))
                                dict->table[idx] = NULL_VAL;
                        else if (vi->i_volatile_dict->table[idx] != NULL)
                                dict->table[idx] = COMPOSE_DICT_HEAD_le64(
                                        sb, vi->i_volatile_dict->table[idx]);
                        vi->i_volatile_dict->table[idx] = NULL;
                        dirty = true;
                }
        }
}

/*
 * Some ideas on fast fsync (of a dir):
 *
 * 1. Batch and coalesce. A newly inserted dentry should be marked, and
 * marked again on its removal, so that unnecessary dep_diradd work can be
 * avoided.
 *
 * 2. Split! The lock (only when a single lock is needed) can be temporarily
 * given up between handling two deps. This requires that the dentry pointed
 * to by dir_pi is not reclaimed in the meantime (as in RCU). Combined with
 * the following idea, this is quite acceptable.
 *
 * 3. Delayed free. Removal operations can be delayed until the locks are
 * released.
 *
 * Parallel fsync for a vi is not thoroughly considered, though.
 *
 * 4. Detach only if the list is empty?
 */

static void fsync_rename_inode(struct inode *dir)
{
        struct eufs_inode_info *vi = EUFS_I(dir);

        if (!vi->i_is_dirty)
                return;
        /* I'm holding the lock, so if it's dirty, it's dirty. */
        fsync_dir_oneshot(dir);
}
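/*
 * Persist both parent directories involved in a rename. The inodes already
 * locked by rename are first marked lock-transferable so that
 * do_dep_diradd_oneshot() can proceed on them without deadlocking on the
 * locks we hold.
 */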
void fsync_rename_inodes(struct inode *old_dir, struct inode *new_dir,
                         struct inode **locked_inodes)
{
        int i;
        struct inode *inode;

        /*
         * The two parent dirs might have had a parent-child relation some
         * time before, so we need to make these two dirs transferable too.
         */
        for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) {
                inode = locked_inodes[i];
                if (inode)
                        eufs_inode_mark_lock_transferable(inode);
        }

        if (old_dir == new_dir) {
                fsync_rename_inode(old_dir);
        } else {
                fsync_rename_inode(old_dir);
                fsync_rename_inode(new_dir);
        }

        for (i = 0; i < EUFS_INODE_CNT_IN_RENAME; i++) {
                inode = locked_inodes[i];
                if (inode)
                        eufs_inode_wait_lock_transfer_done(inode);
        }
}

static void eufs_update_persisted_seq(struct eufs_inode_info *vi,
                                      struct list_head *head)
{
        if (!list_empty(head)) {
                struct dep_node *dep =
                        list_last_entry(head, struct dep_node, node);

                vi->i_persisted_dep_seq = dep->seq;
        }
}

static int fsync_dir_bg(struct inode *dir)
{
        struct dep_node *dep, *next;
        LIST_HEAD(detached_list);
        LIST_HEAD(dump_list);
        int i;
#define FSYNC_DIR_VI_LOOP_NUM (20)
        struct eufs_inode_info *vi = EUFS_I(dir);
        struct super_block *sb = dir->i_sb;
        struct eufs_sb_info *sbi = EUFS_SB(sb);
        struct eufs_inode *pi = EUFS_PI(dir);
        u64 bitset[8] = { 0 };
        int dep_count = 0;

retry:
        inode_urgent_lock(dir);
        /* Phase 1 */
        for (i = FSYNC_DIR_VI_LOOP_NUM; i >= 0; --i) {
                /* Get all deps round by round */
                if (i == 0) {
                        /* Last round */
                        inode_header_lock(dir);
                }
                inode_dep_lock(dir);
                if (list_empty(&vi->i_dep_list) && i > 0) {
                        /* Skip to last round */
                        i = 1;
                }
                list_cut_position(&detached_list, &vi->i_dep_list,
                                  vi->i_dep_list.prev);
                if (i > 0)
                        inode_dep_unlock(dir);

                /* Handle the deps one by one. */
                list_for_each_entry_safe(dep, next, &detached_list, node) {
                        if (dep->type == DEP_DIRADD) {
                                /*
                                 * FIXME: the lockset might be different since
                                 * we might have released the inode lock.
                                 */
                                do_dep_diradd_oneshot(dir, dep, bitset);
                        } else if (dep->type == DEP_DIRREM) {
                                do_dep_dirrem(dir, dep, bitset);
                        } else
                                BUG();
                }
                list_splice_tail_init(&detached_list, &dump_list);

                if (i == 0) {
                        eufs_pbarrier();
                        if (!list_empty(&dump_list))
                                /* Phase 2 */
                                eufs_sync_buckets(vi, bitset);
                        inode_dep_unlock(dir);
                        inode_header_unlock(dir);
                        break;
                }
        }
        inode_urgent_unlock(dir);

        /* Phase 3 */
        inode_lock(dir);
        if (!list_empty(&vi->i_dep_list)) {
                inode_unlock(dir);
                /* Handle new deps added between phase 2 and phase 3. */
                /* FIXME: livelock possible! */
                goto retry;
        }
        if (dir->i_nlink)
                eufs_sync_pinode(dir, pi, false);
        eufs_update_persisted_seq(vi, &dump_list);

        vi->i_is_persisting = false;
        vi->i_is_dirty = false;

        if (dir->i_nlink)
                persist_pinode(pi);
        inode_unlock(dir);

        eufs_pbarrier();

        /* Reclaim memory and clear the list */
        list_for_each_entry_safe(dep, next, &dump_list, node) {
                struct inode *child_inode = dep->inode;
                struct eufs_inode_info *child_vi = EUFS_I(child_inode);

                if (dep->type == DEP_DIRREM)
                        do_dep_dirrem_reclaim(sb, dep);

                /* Remove from the owner list */
                spin_lock(&child_vi->i_owner_lock);
                list_del_init(&dep->owner_node);
                spin_unlock(&child_vi->i_owner_lock);

                iput(child_inode);

                list_del(&dep->node);
                eufs_free_dep_node(dep);
                dep_count++;
        }
        atomic_sub(dep_count, &sbi->s_nr_dep_nodes);

        eufs_dbg("@cpu=%d !! fsync dir vi done: inode=%px\n",
                 smp_processor_id(), &vi->vfs_inode);
        return 0;
}
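/*
 * Persist a non-directory inode (regular file or symlink): write the in-core
 * inode fields back into its persistent inode and flush it.
 */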
static int fsync_nondir_oneshot(struct inode *inode)
{
        struct eufs_inode_info *vi = EUFS_I(inode);
        struct eufs_inode *pi;

        /* For files other than dirs */
        WARN(S_ISDIR(inode->i_mode), "%s on a dir!", __func__);

        /* The inode is about to be removed: nothing to do. */
        if (!inode->i_nlink) {
                vi->i_is_dirty = false;
                return 0;
        }
        pi = EUFS_PI(inode);
        eufs_sync_pinode(inode, pi, false);
        persist_pinode(pi);
        vi->i_is_dirty = false;
        return 0;
}

static int fsync_nondir_bg(struct inode *inode)
{
        struct eufs_inode_info *vi = EUFS_I(inode);
        int r;

        inode_lock(inode);
        r = fsync_nondir_oneshot(inode);
        vi->i_is_persisting = false;
        inode_unlock(inode);
        return r;
}

static void fsync_bg(struct inode *inode)
{
        struct eufs_sb_info *sbi = EUFS_SB(inode->i_sb);

        wait_on_inode(inode);
        /* Reading i_mode may need no protection */
        if (S_ISDIR(inode->i_mode))
                fsync_dir_bg(inode);
        else
                fsync_nondir_bg(inode);
        /* Drop the inode reference. */
        iput(inode);
        if (atomic_dec_and_test(&sbi->s_nr_dirty_inodes) && sbi->s_draining) {
                /* End of draining */
                sbi->s_draining = false;
        }
}

void fsync_oneshot(struct inode *inode)
{
        /* Reading i_mode may need no protection */
        if (S_ISDIR(inode->i_mode))
                fsync_dir_oneshot(inode);
        else
                fsync_nondir_oneshot(inode);
}
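/*
 * Persist a DEP_DIRADD dep: flush the child's persistent inode (and its root
 * page for symlinks and directories), persist the allocations and the name,
 * then clear the dentry's not-persist flag and flush the dentry itself. The
 * touched bucket index is recorded in @bitset for the later bucket sync.
 */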
static void do_dep_diradd_oneshot(struct inode *dir_inode,
                                  struct dep_node *dep, u64 *bitset)
{
        struct super_block *sb = dir_inode->i_sb;
        struct nv_dict_entry *de = dep->de;
        struct inode *inode = dep->inode;
        struct eufs_inode_info *dir_vi = EUFS_I(dir_inode);
        struct eufs_inode *pi;
        struct eufs_inode *fresh_pi;
        int idx;
        void *buffer[16];
        struct alloc_batch ab;
        bool lock_transferred = false;

        idx = INDEX(de->hv);
        bitset[idx / 64] = bitset[idx / 64] | (0x1ull << (idx & 63));

        if (de->volatile_next == EUFS_DIR_DELNEW) {
                /*
                 * The de is already invisible from both the latest view and
                 * the consistent view. It will be handled in the
                 * corresponding dirrem.
                 */
                return;
        }

        /* The not-persist flag is the sign of a pending diradd. */
        WARN(!eufs_dentry_is_not_persist(de), "diradd wrong sign");

        pi = s2p(sb, de->inode);

        wait_on_inode(inode);
retry:
        if (likely(inode_trylock(inode))) {
                /* Got the lock */
        } else {
                if (eufs_inode_mark_lock_transferring(inode)) {
                        lock_transferred = true;
                } else {
                        cond_resched();
                        goto retry;
                }
        }

        eufs_sync_pinode(inode, pi, false);
        fresh_pi = EUFS_FRESH_PI(pi);

        if (!lock_transferred)
                inode_unlock(inode);
        else
                eufs_inode_lock_transfer_done(inode);

        ab.n_used = 0;
        ab.size = 16;
        ab.batch = buffer;
        eufs_alloc_batch_add(sb, &ab, de);

        /*
         * Force the persistence of the allocation without checking.
         * TODO: we should differentiate the link and create syscalls to
         * agree with the checking.
         */
        eufs_alloc_persist(sb, pi, true);

        if (S_ISLNK(fresh_pi->i_mode)) {
                void *root = o2p(sb, eufs_iread_root(fresh_pi));

                /* A regular file's root is handled in the btree. */
                /* In case of a hard link, we must force the allocation persistence. */
                eufs_alloc_persist(sb, root, true);
                persist_symlink(root);
        } else if (S_ISDIR(fresh_pi->i_mode)) {
                void *root = o2p(sb, eufs_iread_root(fresh_pi));

                eufs_alloc_persist(sb, root, false);
                persist_page(root);
        }
        persist_name(sb, de, &ab);
        eufs_alloc_batch_persist_reset(sb, &ab);

        persist_pinode(pi);

        spin_lock(&dir_vi->i_dentry_persist_lock);
        eufs_dentry_clr_not_persist_flag(de);
        spin_unlock(&dir_vi->i_dentry_persist_lock);

        persist_dentry(de);
}

void eufs_dir_fsync_oneshot(struct inode *dir)
{
        struct dep_node *dep;
        struct dep_node *next;
        struct super_block *sb = dir->i_sb;
        struct eufs_sb_info *sbi = EUFS_SB(sb);
        struct eufs_inode_info *vi = EUFS_I(dir);
        LIST_HEAD(detached_list);
        u64 bitset[8] = { 0 };
        int dep_count = 0;

        BUG_ON(!inode_is_locked(dir));

        inode_urgent_lock(dir);

        /* Get all deps */
        inode_header_lock(dir);
        inode_dep_lock(dir);

        if (list_empty(&vi->i_dep_list))
                goto unlock_sync_pinode;

        list_for_each_entry(dep, &vi->i_dep_list, node) {
                if (dep->type == DEP_DIRADD)
                        do_dep_diradd_oneshot(dir, dep, bitset);
                else if (dep->type == DEP_DIRREM)
                        do_dep_dirrem(dir, dep, bitset);
                else
                        BUG();
        }
        list_splice_init(&vi->i_dep_list, &detached_list);

        /* Sync the buckets */
        eufs_pbarrier();
        eufs_sync_buckets(vi, bitset);

unlock_sync_pinode:
        inode_dep_unlock(dir);
        inode_header_unlock(dir);

        /* Sync the pinode */
        if (dir->i_nlink)
                eufs_sync_pinode(dir, EUFS_PI(dir), false);
        eufs_pbarrier();

        eufs_update_persisted_seq(vi, &detached_list);

        vi->i_is_dirty = false;

        /* Reclaim memory and clear the list */
        list_for_each_entry_safe(dep, next, &detached_list, node) {
                struct inode *child_inode = dep->inode;
                struct eufs_inode_info *child_vinode = EUFS_I(child_inode);

                spin_lock(&child_vinode->i_owner_lock);
                list_del_init(&dep->owner_node);
                spin_unlock(&child_vinode->i_owner_lock);

                if (dep->type == DEP_DIRREM) {
                        do_dep_dirrem_reclaim(sb, dep);
                        iput(dep->inode);
                } else if (dep->type == DEP_DIRADD) {
                        iput(dep->inode);
                }

                list_del(&dep->node);
                eufs_free_dep_node(dep);
                dep_count++;
        }
        atomic_sub(dep_count, &sbi->s_nr_dep_nodes);

        inode_urgent_unlock(dir);
}

void fsync_on_draining(struct inode *dir, struct inode *inode)
{
        BUG_ON(!dir);
        BUG_ON(!inode_is_locked(dir));
        BUG_ON(inode && !inode_is_locked(inode));

        /* For link/unlink/rmdir */
        if (inode)
                eufs_inode_mark_lock_transferable(inode);

        fsync_dir_oneshot(dir);

        if (inode)
                eufs_inode_wait_lock_transfer_done(inode);
}

#define NR_FLUSH_EACH_ROUND (16)
#define FLUSH_START_THRESHOLD (64)
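/*
 * Drain the per-cpu persistee lists of all CPUs in @mask and persist each
 * queued dirty inode in the background. Returns the number of non-empty
 * lists found, so the caller can keep calling until everything is drained.
 */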
static __always_inline int handle_persistees_for_each_cpu(
        struct super_block *sb, const struct cpumask *mask, int idx)
{
        struct eufs_sb_info *sbi = EUFS_SB(sb);
        struct llist_node *list;
        struct llist_head *head;
        struct eufs_inode_info *vi;
        struct eufs_inode_info *next;
        int n_active_list;
        int cpu;
        bool need;

retry:
        need = sbi->need_sync[idx];
        n_active_list = 0;
        for_each_cpu(cpu, mask) {
                head = per_cpu_ptr(sbi->persistee_list, cpu);
                if (unlikely(llist_empty(head)))
                        continue;
                n_active_list++;
                list = llist_del_all(head);

                eufs_dbg("persister get list %px for cpu%d\n", list, cpu);

                /* reverse the ordering for better locality? */
                llist_for_each_entry_safe(vi, next, list, i_persistee_node)
                        fsync_bg(&vi->vfs_inode);

                eufs_dbg("persister handled list %px\n", list);
        }
        /*
         * We need a complete round of run for fssync. If
         * need != sbi->need_sync[idx], need_sync was modified during our last
         * round, so we retry to ensure a complete round of run. It's okay if
         * the dirty inodes of a cpu are still being processed by another
         * persister, since we will wait for all persisters to finish for
         * fssync.
         */
        if (need != READ_ONCE(sbi->need_sync[idx]))
                goto retry;
        if (need) {
                sbi->need_sync[idx] = false;
                wake_up(&sbi->sync_wq);
        }
        if (READ_ONCE(sbi->need_sync[idx]))
                goto retry;
        return n_active_list;
}

static int persister(void *data)
{
        struct super_block *sb = data;
        struct eufs_sb_info *sbi = EUFS_SB(sb);
        const struct cpumask *mask = cpumask_of_node(numa_node_id());
        const int period = (persist_period == 0) ?
                                   /* default */
                                   (HZ / 4) :
                                   /* less than a second */
                                   ((persist_period < 0) ?
                                            (HZ / (-persist_period)) :
                                            /* more than a second */
                                            (HZ * persist_period));
        int idx = 0;
        int num_persisters = num_sockets * persisters_per_socket;

        eufs_info("sb=%px cpu=%d cpumask=%*pbl period=%d\n", data,
                  smp_processor_id(), cpumask_pr_args(mask), period);

        /* Find this thread's slot in sbi->persisters. */
        while (idx < num_persisters && sbi->persisters[idx] != current)
                idx++;
        BUG_ON(idx >= num_persisters);

        while (!kthread_should_stop()) {
                set_current_state(TASK_INTERRUPTIBLE);
                schedule_timeout(period);

                handle_persistees_for_each_cpu(sb, mask, idx);
        }

        /* Flush whatever is left before the thread exits. */
        while (handle_persistees_for_each_cpu(sb, mask, idx))
                cpu_relax();

        eufs_info("finalizing on %d\n", smp_processor_id());

        return 0;
}

int dep_init(struct super_block *sb)
{
        struct eufs_sb_info *sbi = EUFS_SB(sb);
        int cpu;
        int i, j;
        char name[BDEVNAME_SIZE];
        int err;

        sbi->persistee_list = alloc_percpu(struct llist_head);
        if (!sbi->persistee_list) {
                err = -ENOMEM;
                goto cleanup;
        }

        /* Init each llist */
        for_each_possible_cpu(cpu)
                init_llist_head(per_cpu_ptr(sbi->persistee_list, cpu));

        sbi->persisters = kmalloc(sizeof(struct task_struct *) *
                                          persisters_per_socket * num_sockets,
                                  GFP_KERNEL);
        if (!sbi->persisters) {
                err = -ENOMEM;
                goto cleanup;
        }

        sbi->need_sync = kzalloc(
                sizeof(bool) * persisters_per_socket * num_sockets, GFP_KERNEL);
        if (!sbi->need_sync) {
                err = -ENOMEM;
                goto cleanup;
        }

        init_waitqueue_head(&sbi->sync_wq);

        bdevname(sb->s_bdev, name);
        for (i = 0; i < num_sockets; ++i) {
                for (j = 0; j < persisters_per_socket; ++j) {
                        int idx = i * persisters_per_socket + j;

                        sbi->persisters[idx] = kthread_create_on_node(
                                persister, sb, i, "hmfs/%s-%d.%d", name, i, j);
                        if (IS_ERR(sbi->persisters[idx])) {
                                err = PTR_ERR(sbi->persisters[idx]);
                                pr_err("create persister %s-%d.%d error %d",
                                       name, i, j, err);
                                sbi->persisters[idx] = NULL;
                                goto cleanup;
                        }
                        set_cpus_allowed_ptr(sbi->persisters[idx],
                                             cpumask_of_node(i));
                        wake_up_process(sbi->persisters[idx]);
                }
        }
        return 0;

cleanup:
        dep_fini(sb);
        return err;
}

void dep_fini(struct super_block *sb)
{
        struct eufs_sb_info *sbi = EUFS_SB(sb);

        if (sbi->persisters) {
                int i;

                for (i = 0; i < persisters_per_socket * num_sockets; ++i) {
                        if (sbi->persisters[i]) {
                                kthread_stop(sbi->persisters[i]);
                                sbi->persisters[i] = NULL;
                        }
                }
                kfree(sbi->persisters);
                sbi->persisters = NULL;
        }
        kfree(sbi->need_sync);
        sbi->need_sync = NULL;
        free_percpu(sbi->persistee_list);
        sbi->persistee_list = NULL;
}