1. 06 12月, 2022 9 次提交
  2. 23 11月, 2022 1 次提交
    • F
      btrfs: do not modify log tree while holding a leaf from fs tree locked · 796787c9
      Filipe Manana 提交于
      When logging an inode in full mode, or when logging xattrs or when logging
      the dir index items of a directory, we are modifying the log tree while
      holding a read lock on a leaf from the fs/subvolume tree. This can lead to
      a deadlock in rare circumstances, but it is a real possibility, and it was
      recently reported by syzbot with the following trace from lockdep:
      
         WARNING: possible circular locking dependency detected
         6.1.0-rc5-next-20221116-syzkaller #0 Not tainted
         ------------------------------------------------------
         syz-executor.1/16154 is trying to acquire lock:
         ffff88807e3084a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0xa1/0xf30 fs/btrfs/delayed-inode.c:256
      
         but task is already holding lock:
         ffff88807df33078 (btrfs-log-00){++++}-{3:3}, at: __btrfs_tree_lock+0x32/0x3d0 fs/btrfs/locking.c:197
      
         which lock already depends on the new lock.
      
         the existing dependency chain (in reverse order) is:
      
         -> #2 (btrfs-log-00){++++}-{3:3}:
                down_read_nested+0x9e/0x450 kernel/locking/rwsem.c:1634
                __btrfs_tree_read_lock+0x32/0x350 fs/btrfs/locking.c:135
                btrfs_tree_read_lock fs/btrfs/locking.c:141 [inline]
                btrfs_read_lock_root_node+0x82/0x3a0 fs/btrfs/locking.c:280
                btrfs_search_slot_get_root fs/btrfs/ctree.c:1678 [inline]
                btrfs_search_slot+0x3ca/0x2c70 fs/btrfs/ctree.c:1998
                btrfs_lookup_csum+0x116/0x3f0 fs/btrfs/file-item.c:209
                btrfs_csum_file_blocks+0x40e/0x1370 fs/btrfs/file-item.c:1021
                log_csums.isra.0+0x244/0x2d0 fs/btrfs/tree-log.c:4258
                copy_items.isra.0+0xbfb/0xed0 fs/btrfs/tree-log.c:4403
                copy_inode_items_to_log+0x13d6/0x1d90 fs/btrfs/tree-log.c:5873
                btrfs_log_inode+0xb19/0x4680 fs/btrfs/tree-log.c:6495
                btrfs_log_inode_parent+0x890/0x2a20 fs/btrfs/tree-log.c:6982
                btrfs_log_dentry_safe+0x59/0x80 fs/btrfs/tree-log.c:7083
                btrfs_sync_file+0xa41/0x13c0 fs/btrfs/file.c:1921
                vfs_fsync_range+0x13e/0x230 fs/sync.c:188
                generic_write_sync include/linux/fs.h:2856 [inline]
                iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128
                btrfs_direct_write fs/btrfs/file.c:1536 [inline]
                btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668
                call_write_iter include/linux/fs.h:2160 [inline]
                do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735
                do_iter_write+0x182/0x700 fs/read_write.c:861
                vfs_iter_write+0x74/0xa0 fs/read_write.c:902
                iter_file_splice_write+0x745/0xc90 fs/splice.c:686
                do_splice_from fs/splice.c:764 [inline]
                direct_splice_actor+0x114/0x180 fs/splice.c:931
                splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886
                do_splice_direct+0x1ab/0x280 fs/splice.c:974
                do_sendfile+0xb19/0x1270 fs/read_write.c:1255
                __do_sys_sendfile64 fs/read_write.c:1323 [inline]
                __se_sys_sendfile64 fs/read_write.c:1309 [inline]
                __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309
                do_syscall_x64 arch/x86/entry/common.c:50 [inline]
                do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
                entry_SYSCALL_64_after_hwframe+0x63/0xcd
      
         -> #1 (btrfs-tree-00){++++}-{3:3}:
                __lock_release kernel/locking/lockdep.c:5382 [inline]
                lock_release+0x371/0x810 kernel/locking/lockdep.c:5688
                up_write+0x2a/0x520 kernel/locking/rwsem.c:1614
                btrfs_tree_unlock_rw fs/btrfs/locking.h:189 [inline]
                btrfs_unlock_up_safe+0x1e3/0x290 fs/btrfs/locking.c:238
                search_leaf fs/btrfs/ctree.c:1832 [inline]
                btrfs_search_slot+0x265e/0x2c70 fs/btrfs/ctree.c:2074
                btrfs_insert_empty_items+0xbd/0x1c0 fs/btrfs/ctree.c:4133
                btrfs_insert_delayed_item+0x826/0xfa0 fs/btrfs/delayed-inode.c:746
                btrfs_insert_delayed_items fs/btrfs/delayed-inode.c:824 [inline]
                __btrfs_commit_inode_delayed_items fs/btrfs/delayed-inode.c:1111 [inline]
                __btrfs_run_delayed_items+0x280/0x590 fs/btrfs/delayed-inode.c:1153
                flush_space+0x147/0xe90 fs/btrfs/space-info.c:728
                btrfs_async_reclaim_metadata_space+0x541/0xc10 fs/btrfs/space-info.c:1086
                process_one_work+0x9bf/0x1710 kernel/workqueue.c:2289
                worker_thread+0x669/0x1090 kernel/workqueue.c:2436
                kthread+0x2e8/0x3a0 kernel/kthread.c:376
                ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
      
         -> #0 (&delayed_node->mutex){+.+.}-{3:3}:
                check_prev_add kernel/locking/lockdep.c:3097 [inline]
                check_prevs_add kernel/locking/lockdep.c:3216 [inline]
                validate_chain kernel/locking/lockdep.c:3831 [inline]
                __lock_acquire+0x2a43/0x56d0 kernel/locking/lockdep.c:5055
                lock_acquire kernel/locking/lockdep.c:5668 [inline]
                lock_acquire+0x1e3/0x630 kernel/locking/lockdep.c:5633
                __mutex_lock_common kernel/locking/mutex.c:603 [inline]
                __mutex_lock+0x12f/0x1360 kernel/locking/mutex.c:747
                __btrfs_release_delayed_node.part.0+0xa1/0xf30 fs/btrfs/delayed-inode.c:256
                __btrfs_release_delayed_node fs/btrfs/delayed-inode.c:251 [inline]
                btrfs_release_delayed_node fs/btrfs/delayed-inode.c:281 [inline]
                btrfs_remove_delayed_node+0x52/0x60 fs/btrfs/delayed-inode.c:1285
                btrfs_evict_inode+0x511/0xf30 fs/btrfs/inode.c:5554
                evict+0x2ed/0x6b0 fs/inode.c:664
                dispose_list+0x117/0x1e0 fs/inode.c:697
                prune_icache_sb+0xeb/0x150 fs/inode.c:896
                super_cache_scan+0x391/0x590 fs/super.c:106
                do_shrink_slab+0x464/0xce0 mm/vmscan.c:843
                shrink_slab_memcg mm/vmscan.c:912 [inline]
                shrink_slab+0x388/0x660 mm/vmscan.c:991
                shrink_node_memcgs mm/vmscan.c:6088 [inline]
                shrink_node+0x93d/0x1f30 mm/vmscan.c:6117
                shrink_zones mm/vmscan.c:6355 [inline]
                do_try_to_free_pages+0x3b4/0x17a0 mm/vmscan.c:6417
                try_to_free_mem_cgroup_pages+0x3a4/0xa70 mm/vmscan.c:6732
                reclaim_high.constprop.0+0x182/0x230 mm/memcontrol.c:2393
                mem_cgroup_handle_over_high+0x190/0x520 mm/memcontrol.c:2578
                try_charge_memcg+0xe0c/0x12f0 mm/memcontrol.c:2816
                try_charge mm/memcontrol.c:2827 [inline]
                charge_memcg+0x90/0x3b0 mm/memcontrol.c:6889
                __mem_cgroup_charge+0x2b/0x90 mm/memcontrol.c:6910
                mem_cgroup_charge include/linux/memcontrol.h:667 [inline]
                __filemap_add_folio+0x615/0xf80 mm/filemap.c:852
                filemap_add_folio+0xaf/0x1e0 mm/filemap.c:934
                __filemap_get_folio+0x389/0xd80 mm/filemap.c:1976
                pagecache_get_page+0x2e/0x280 mm/folio-compat.c:104
                find_or_create_page include/linux/pagemap.h:612 [inline]
                alloc_extent_buffer+0x2b9/0x1580 fs/btrfs/extent_io.c:4588
                btrfs_init_new_buffer fs/btrfs/extent-tree.c:4869 [inline]
                btrfs_alloc_tree_block+0x2e1/0x1320 fs/btrfs/extent-tree.c:4988
                __btrfs_cow_block+0x3b2/0x1420 fs/btrfs/ctree.c:440
                btrfs_cow_block+0x2fa/0x950 fs/btrfs/ctree.c:595
                btrfs_search_slot+0x11b0/0x2c70 fs/btrfs/ctree.c:2038
                btrfs_update_root+0xdb/0x630 fs/btrfs/root-tree.c:137
                update_log_root fs/btrfs/tree-log.c:2841 [inline]
                btrfs_sync_log+0xbfb/0x2870 fs/btrfs/tree-log.c:3064
                btrfs_sync_file+0xdb9/0x13c0 fs/btrfs/file.c:1947
                vfs_fsync_range+0x13e/0x230 fs/sync.c:188
                generic_write_sync include/linux/fs.h:2856 [inline]
                iomap_dio_complete+0x73a/0x920 fs/iomap/direct-io.c:128
                btrfs_direct_write fs/btrfs/file.c:1536 [inline]
                btrfs_do_write_iter+0xba2/0x1470 fs/btrfs/file.c:1668
                call_write_iter include/linux/fs.h:2160 [inline]
                do_iter_readv_writev+0x20b/0x3b0 fs/read_write.c:735
                do_iter_write+0x182/0x700 fs/read_write.c:861
                vfs_iter_write+0x74/0xa0 fs/read_write.c:902
                iter_file_splice_write+0x745/0xc90 fs/splice.c:686
                do_splice_from fs/splice.c:764 [inline]
                direct_splice_actor+0x114/0x180 fs/splice.c:931
                splice_direct_to_actor+0x335/0x8a0 fs/splice.c:886
                do_splice_direct+0x1ab/0x280 fs/splice.c:974
                do_sendfile+0xb19/0x1270 fs/read_write.c:1255
                __do_sys_sendfile64 fs/read_write.c:1323 [inline]
                __se_sys_sendfile64 fs/read_write.c:1309 [inline]
                __x64_sys_sendfile64+0x259/0x2c0 fs/read_write.c:1309
                do_syscall_x64 arch/x86/entry/common.c:50 [inline]
                do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80
                entry_SYSCALL_64_after_hwframe+0x63/0xcd
      
         other info that might help us debug this:
      
         Chain exists of:
           &delayed_node->mutex --> btrfs-tree-00 --> btrfs-log-00
      
         Possible unsafe locking scenario:
      
                CPU0                    CPU1
                ----                    ----
           lock(btrfs-log-00);
                                        lock(btrfs-tree-00);
                                        lock(btrfs-log-00);
           lock(&delayed_node->mutex);
      
      Holding a read lock on a leaf from a fs/subvolume tree creates a nasty
      lock dependency when we are COWing extent buffers for the log tree and we
      have two tasks modifying the log tree, with each one in one of the
      following 2 scenarios:
      
      1) Modifying the log tree triggers an extent buffer allocation while
         holding a write lock on a parent extent buffer from the log tree.
         Allocating the pages for an extent buffer, or the extent buffer
         struct, can trigger inode eviction and finally the inode eviction
         will trigger a release/remove of a delayed node, which requires
         taking the delayed node's mutex;
      
      2) Allocating a metadata extent for a log tree can trigger the async
         reclaim thread and make us wait for it to release enough space and
         unblock our reservation ticket. The reclaim thread can start flushing
         delayed items, and that in turn results in the need to lock delayed
         node mutexes and in the need to write lock extent buffers of a
         subvolume tree - all this while holding a write lock on the parent
         extent buffer in the log tree.
      
      So one task in scenario 1) running in parallel with another task in
      scenario 2) could lead to a deadlock, one wanting to lock a delayed node
      mutex while having a read lock on a leaf from the subvolume, while the
      other is holding the delayed node's mutex and wants to write lock the same
      subvolume leaf for flushing delayed items.
      
      Fix this by cloning the leaf of the fs/subvolume tree, releasing/unlocking
      the fs/subvolume leaf and using the cloned leaf instead.
      
      Reported-by: syzbot+9b7c21f486f5e7f8d029@syzkaller.appspotmail.com
      Link: https://lore.kernel.org/linux-btrfs/000000000000ccc93c05edc4d8cf@google.com/
      CC: stable@vger.kernel.org # 6.0+
      Reviewed-by: Josef Bacik <josef@toxicpanda.com>
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      796787c9
  3. 29 9月, 2022 1 次提交
  4. 26 9月, 2022 14 次提交
    • J
      btrfs: unify the lock/unlock extent variants · 570eb97b
      Josef Bacik 提交于
      We have two variants of lock/unlock extent, one set that takes a cached
      state, another that does not.  This is slightly annoying, and generally
      speaking there are only a few places where we don't have a cached state.
      Simplify this by making lock_extent/unlock_extent the only variant and
      make it take a cached state, then convert all the callers appropriately.
      Signed-off-by: Josef Bacik <josef@toxicpanda.com>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      570eb97b
    • F
      btrfs: simplify adding and replacing references during log replay · 7059c658
      Filipe Manana 提交于
      During log replay, when adding/replacing inode references, there are two
      special cases that have special code for them:
      
      1) When we have an inode with two or more hardlinks in the same directory,
         therefore two or more names encoded in the same inode reference item,
         and one of the hard links gets renamed to the old name of another hard
         link - that is, the index number for a name changes. This was added in
         commit 0d836392 ("Btrfs: fix mount failure after fsync due to
         hard link recreation"), and is covered by test case generic/502 from
         fstests;
      
      2) When we have several inodes that got renamed to an old name of some
         other inode, in a cascading style. The code to deal with this special
         case was added in commit 6b5fc433 ("Btrfs: fix fsync after
         succession of renames of different files"), and is covered by test
         cases generic/526 and generic/527 from fstests.
      
      Both cases can be dealt with by making sure __add_inode_ref() is always
      called by add_inode_ref() for every name encoded in the inode reference
      item, and not just for the first name that has a conflict. With such
      change we no longer need that special casing for the two cases mentioned
      before. So do those changes.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      7059c658
    • F
      btrfs: use delayed items when logging a directory · 30b80f3c
      Filipe Manana 提交于
      When logging a directory we start by flushing all its delayed items.
      That results in adding dir index items to the subvolume btree, for new
      dentries, and removing dir index items from the subvolume btree for any
      dentries that were deleted.
      
      This makes it straightforward to log a directory simply by iterating over
      all the modified subvolume btree leaves, especially when we used to log
      both dir index keys and dir item keys (before commit 339d0354
      ("btrfs: only copy dir index keys when logging a directory")) and when we
      used to copy old dir index entries for leaves modified in the current
      transaction (before commit 732d591a ("btrfs: stop copying old dir
      items when logging a directory")).
      
      From an efficiency point of view this has a couple of drawbacks:
      
      1) Adds extra latency, due to copying delayed items to the subvolume btree
         and deleting dir index items from the btree.
      
         Further if there are other tasks accessing the btree, which is common
         (syscalls like creat, mkdir, rename, link, unlink, truncate, reflinks,
         etc, finishing an ordered extent, etc), lock contention can cause
         further delays, both to the task logging a directory and to the other
         tasks accessing the btree;
      
      2) More time spent overall flushing delayed items, if after logging the
         directory further changes are done to the directory in the same
         transaction.
      
         For example, if we add 10 dentries to a directory, fsync it, add 10
         more dentries, fsync it again, then add 10 more dentries and fsync it
         again, then we end up inserting 3 batches of 10 items to the subvolume
         btree. With the changes from this patch, we flush all the delayed items
         to the btree only once - a single batch of 30 items, and outside the
         logging code (transaction commit or when delayed items are flushed
         asynchronously).
      
      This change simply skips the flushing of delayed items every time we log a
      directory. Instead we copy the delayed insertion items directly to the log
      tree and delete delayed deletion items directly from the log tree.
      This avoids first changing the subvolume btree and then scanning it
      for new items to copy from it to the log tree and detecting deletions
      by observing gaps in consecutive dir index keys in subvolume btree leaves.
      
      Running the following tests on a non-debug kernel (Debian's default kernel
      config), on a box with an NVMe device, a 12-core Intel CPU and 64G of RAM,
      produced the results below.
      
      The results compare a branch without this patch and all the other patches
      it depends on versus the same branch with the patchset applied.
      
      The patchset is comprised of the following patches:
      
        btrfs: don't drop dir index range items when logging a directory
        btrfs: remove the root argument from log_new_dir_dentries()
        btrfs: update stale comment for log_new_dir_dentries()
        btrfs: free list element sooner at log_new_dir_dentries()
        btrfs: avoid memory allocation at log_new_dir_dentries() for common case
        btrfs: remove root argument from btrfs_delayed_item_reserve_metadata()
        btrfs: store index number instead of key in struct btrfs_delayed_item
        btrfs: remove unused logic when looking up delayed items
        btrfs: shrink the size of struct btrfs_delayed_item
        btrfs: search for last logged dir index if it's not cached in the inode
        btrfs: move need_log_inode() to above log_conflicting_inodes()
        btrfs: move log_new_dir_dentries() above btrfs_log_inode()
        btrfs: log conflicting inodes without holding log mutex of the initial inode
        btrfs: skip logging parent dir when conflicting inode is not a dir
        btrfs: use delayed items when logging a directory
      
      Custom test script for testing time spent at btrfs_log_inode():
      
         #!/bin/bash
      
         DEV=/dev/nvme0n1
         MNT=/mnt/nvme0n1
      
         # Total number of files to create in the test directory.
         NUM_FILES=10000
         # Fsync after creating or renaming N files.
         FSYNC_AFTER=100
      
         umount $DEV &> /dev/null
         mkfs.btrfs -f $DEV
         mount -o ssd $DEV $MNT
      
         TEST_DIR=$MNT/testdir
         mkdir $TEST_DIR
      
         echo "Creating files..."
         for ((i = 1; i <= $NUM_FILES; i++)); do
                 echo -n > $TEST_DIR/file_$i
                 if (( ($i % $FSYNC_AFTER) == 0 )); then
                         xfs_io -c "fsync" $TEST_DIR
                 fi
         done
      
         sync
      
         echo "Renaming files..."
         for ((i = 1; i <= $NUM_FILES; i++)); do
                 mv $TEST_DIR/file_$i $TEST_DIR/file_$i.renamed
                 if (( ($i % $FSYNC_AFTER) == 0 )); then
                         xfs_io -c "fsync" $TEST_DIR
                 fi
         done
      
         umount $MNT
      
      And using the following bpftrace script to capture the total time that is
      spent at btrfs_log_inode():
      
         #!/usr/bin/bpftrace
      
         k:btrfs_log_inode
         {
                 @start_log_inode[tid] = nsecs;
         }
      
         kr:btrfs_log_inode
         /@start_log_inode[tid]/
         {
                 $dur = (nsecs - @start_log_inode[tid]) / 1000;
                 @btrfs_log_inode_total_time = sum($dur);
                 delete(@start_log_inode[tid]);
         }
      
         END
         {
                 clear(@start_log_inode);
         }
      
      Result before applying patchset:
      
         @btrfs_log_inode_total_time: 622642
      
      Result after applying patchset:
      
         @btrfs_log_inode_total_time: 354134    (-43.1% time spent)
      
      The following dbench script was also used for testing:
      
         #!/bin/bash
      
         NUM_JOBS=$(nproc --all)
      
         DEV=/dev/nvme0n1
         MNT=/mnt/nvme0n1
         MOUNT_OPTIONS="-o ssd"
         MKFS_OPTIONS="-O no-holes -R free-space-tree"
      
         echo "performance" | \
             tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor
      
         umount $DEV &> /dev/null
         mkfs.btrfs -f $MKFS_OPTIONS $DEV
         mount $MOUNT_OPTIONS $DEV $MNT
      
         dbench -D $MNT --skip-cleanup -t 120 -S $NUM_JOBS
      
         umount $MNT
      
      Before patchset:
      
       Operation      Count    AvgLat    MaxLat
       ----------------------------------------
       NTCreateX    3322265     0.034    21.032
       Close        2440562     0.002     0.994
       Rename        140664     1.150   269.633
       Unlink        670796     1.093   269.678
       Deltree           96     5.481    15.510
       Mkdir             48     0.004     0.052
       Qpathinfo    3010924     0.014     8.127
       Qfileinfo     528055     0.001     0.518
       Qfsinfo       552113     0.003     0.372
       Sfileinfo     270575     0.005     0.688
       Find         1164176     0.052    13.931
       WriteX       1658537     0.019     5.918
       ReadX        5207412     0.003     1.034
       LockX          10818     0.003     0.079
       UnlockX        10818     0.002     0.313
       Flush         232811     1.027   269.735
      
      Throughput 869.867 MB/sec (sync dirs)  12 clients  12 procs  max_latency=269.741 ms
      
      After patchset:
      
       Operation      Count    AvgLat    MaxLat
       ----------------------------------------
       NTCreateX    4152738     0.029    20.863
       Close        3050770     0.002     1.119
       Rename        175829     0.871   211.741
       Unlink        838447     0.845   211.724
       Deltree          120     4.798    14.162
       Mkdir             60     0.003     0.005
       Qpathinfo    3763807     0.011     4.673
       Qfileinfo     660111     0.001     0.400
       Qfsinfo       690141     0.003     0.429
       Sfileinfo     338260     0.005     0.725
       Find         1455273     0.046     6.787
       WriteX       2073307     0.017     5.690
       ReadX        6509193     0.003     1.171
       LockX          13522     0.003     0.077
       UnlockX        13522     0.002     0.125
       Flush         291044     0.811   211.631
      
      Throughput 1089.27 MB/sec (sync dirs)  12 clients  12 procs  max_latency=211.750 ms
      
      (+25.2% throughput, -21.5% max latency)
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      30b80f3c
    • F
      btrfs: skip logging parent dir when conflicting inode is not a dir · 5557a069
      Filipe Manana 提交于
      When we find a conflicting inode (an inode that had the same name and
      parent directory as the inode we are logging now) that was deleted in the
      current transaction, we always end up logging its parent directory.
      
      This is to deal with the case where the conflicting inode corresponds to
      a deleted subvolume/snapshot or a directory that had subvolumes/snapshots
      (or some subdirectory inside it had subvolumes/snapshots, etc), because
      we can't deal with dropping subvolumes/snapshots during log replay. So
      if we log the parent directory, and if we are dealing with these special
      cases, then we fall back to a transaction commit when logging the parent,
      because its last_unlink_trans will match the current transaction (which
      gets set and propagated when a subvolume/snapshot is deleted).
      
      This change skips the logging of the parent directory when the conflicting
      inode is not a directory (or a subvolume/snapshot). This is ok because in
      this case logging the current inode is enough to trigger an unlink of the
      conflicting inode during log replay.
      
      So for a case like this:
      
        $ mkdir /mnt/dir
        $ echo -n "first foo data" > /mnt/dir/foo
      
        $ sync
      
        $ rm -f /mnt/dir/foo
        $ echo -n "second foo data" > /mnt/dir/foo
        $ xfs_io -c "fsync" /mnt/dir/foo
      
      We avoid logging parent directory "dir" when logging the new file "foo".
      In other cases it avoids falling back to a transaction commit, when the
      parent directory has a last_unlink_trans value that matches the current
      transaction, due to moving a file from it to some other directory.
      
      This is a case that happens frequently with dbench for example, where a
      new file that has the name/parent of another file that was deleted in the
      current transaction, is fsynced.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      5557a069
    • F
      btrfs: log conflicting inodes without holding log mutex of the initial inode · e09d94c9
      Filipe Manana 提交于
      When logging an inode, if we detect the inode has a reference that
      conflicts with some other inode that got renamed, we log that other inode
      while holding the log mutex of the current inode. We then find out if
      there are other inodes that conflict with the first conflicting inode,
      and log them while under the log mutex of the original inode. This is
      fine because the recursion can only happen once.
      
      For the upcoming work where we directly log delayed items without flushing
      them first to the subvolume tree, this recursion adds a lot of complexity
      and it's hard to keep lockdep happy about it.
      
      So collect a list of conflicting inodes and then log the inodes after
      unlocking the log mutex of the inode we started with.
      
      Also limit the maximum number of conflict inodes we log to 10, to avoid
      spending too much time logging (and maybe allocating too many list
      elements too), as typically we don't have more than 1 or 2 conflicting
      inodes - if we go over the limit, simply fall back to a transaction commit.
      
      A user can intentionally create a very long list of conflicting inodes
      by performing a very long succession of renames like this:
      
        (...)
        rename E to F
        rename D to E
        rename C to D
        rename B to C
        rename A to B
        touch A (create a new file named A)
        fsync A
      
      If that happened for a sequence of hundreds or thousands of renames, it
      could massively slow down the logging and cause other secondary effects
      like for example blocking other fsync operations and transaction commits
      for a very long time (assuming it wouldn't run into -ENOSPC or -ENOMEM
      first). However, such cases are very uncommon in practice; nevertheless
      it's better to be prepared for them and avoid chaos.
      Such a long sequence of conflicting inodes could already be created before
      this change.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      e09d94c9
    • F
      btrfs: move log_new_dir_dentries() above btrfs_log_inode() · f6d86dbe
      Filipe Manana 提交于
      The static function log_new_dir_dentries() is currently defined below
      btrfs_log_inode(), but in an upcoming patch a new function is introduced
      that is called by btrfs_log_inode() and this new function needs to call
      log_new_dir_dentries(). So move log_new_dir_dentries() to a location
      between btrfs_log_inode() and need_log_inode() (the latter is called by
      log_new_dir_dentries()).
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      f6d86dbe
    • F
      btrfs: move need_log_inode() to above log_conflicting_inodes() · a3751024
      Filipe Manana 提交于
      The static function need_log_inode() is defined below btrfs_log_inode()
      and log_conflicting_inodes(), but in the next patches in the series we
      will need to call need_log_inode() in a couple new functions that will be
      used by btrfs_log_inode(). So move its definition to a location above
      log_conflicting_inodes().
      
      Also make its arguments 'const', since they are not supposed to be
      modified.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      a3751024
    • F
      btrfs: search for last logged dir index if it's not cached in the inode · 193df624
      Filipe Manana 提交于
      The key offset of the last dir index item that was logged is stored in
      the inode's last_dir_index_offset field. However that field is not
      persisted in the inode item or elsewhere, so if the inode gets evicted
      and reloaded, it gets a value of (u64)-1, so that when we are logging
      dir index items we check if they were logged before, to avoid attempts
      to insert duplicated keys and falling back to a transaction commit.
      
      Improve on this by searching for the last dir index that was logged when
      we start logging a directory if the inode's last_dir_index_offset is not
      set (has a value of (u64)-1) and it was logged before. This avoids
      checking if each dir index item we find was already logged before, and
      simplifies the logging of dir index items (process_dir_items_leaf()).
      
      This will also be needed for an incoming change where we start logging
      delayed items directly, without flushing them first.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      193df624
    • F
      btrfs: avoid memory allocation at log_new_dir_dentries() for common case · 009d9bea
      Filipe Manana 提交于
      At log_new_dir_dentries() we always start by allocating a list element
      for the starting inode and then do a while loop with the condition being
      a list emptiness check.
      
      This however is not needed, we can avoid allocating this initial list
      element and then just check for the list emptiness at the end of the
      loop's body. So just do that to save one memory allocation from the
      kmalloc-32 slab.
      
      This allows for not doing any memory allocation when we don't have any
      subdirectory to log, which is a very common case.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      009d9bea
    • F
      btrfs: free list element sooner at log_new_dir_dentries() · 40084813
      Filipe Manana 提交于
      At log_new_dir_dentries(), there's no need to keep the current list
      element allocated while processing the leaves with directory items for
      the current directory, and while logging other inodes. Plus in case we
      find a subdirectory, we also end up allocating a new list element while
      the current one is still allocated, temporarily using more memory than
      necessary.
      
      So free the current list element early on, before processing leaves.
      Also make the removal and release of all list elements in case of an
      error simpler by eliminating the label and goto, adding an explicit
      loop to release all list elements in case an error happens.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      40084813
    • F
      btrfs: update stale comment for log_new_dir_dentries() · b96c552b
      Filipe Manana 提交于
      The comment refers to the function log_dir_items() in order to check why
      the inodes of new directory entries need to be logged, but the relevant
      comments are no longer at log_dir_items(), they were moved to the function
      process_dir_items_leaf() in commit eb10d85e ("btrfs: factor out the
      copying loop of dir items from log_dir_items()"). So update it with the
      current function name.
      
      Also remove references with i_mutex to "VFS lock", since the inode lock
      is no longer a mutex since 2016 (it's now a rw semaphore).
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      b96c552b
    • F
      btrfs: remove the root argument from log_new_dir_dentries() · 8786a6d7
      Filipe Manana 提交于
      There's no point in passing a root argument to log_new_dir_dentries()
      because it always corresponds to the root of the given inode. So remove
      it and extract the root from the given inode.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      8786a6d7
    • F
      btrfs: don't drop dir index range items when logging a directory · 04fc7d51
      Filipe Manana 提交于
      When logging a directory that was previously logged in the current
      transaction, we drop all the range items (BTRFS_DIR_LOG_INDEX_KEY key
      type). This is because we will process all leaves in the subvolume's tree
      that were changed in the current transaction and then add range items for
      covering new dir index items and deleted dir index items, which could
      cover now a larger range than before.
      
      We used to fail if we tried to insert a range item key that already
      exists, so we dropped all range items to avoid failing. However nowadays,
      since commit 750ee454 ("btrfs: fix assertion failure when logging
      directory key range item"), we simply update any range item that already
      exists, increasing its range's last dir index if needed. Since the range
      covered by a range item can never decrease, due to the fact that dir index
      values come from a monotonically increasing counter and are never reused,
      we can stop dropping all range items before we start logging a directory.
      By not dropping the items we can avoid having occasional tree rebalance
      operations.
      
      This will also be needed for an incoming change where we start logging
      delayed items directly, without flushing them first.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      04fc7d51
    • O
      btrfs: rename btrfs_insert_file_extent() to btrfs_insert_hole_extent() · d1f68ba0
      Omar Sandoval 提交于
      btrfs_insert_file_extent() is only ever used to insert holes, so rename
      it and remove the redundant parameters.
      Reviewed-by: Qu Wenruo <wqu@suse.com>
      Signed-off-by: Omar Sandoval <osandov@osandov.com>
      Signed-off-by: Sweet Tea Dorminy <sweettea-kernel@dorminy.me>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      d1f68ba0
  5. 17 8月, 2022 2 次提交
    • F
      btrfs: fix warning during log replay when bumping inode link count · 769030e1
      Filipe Manana 提交于
      During log replay, at add_link(), we may increment the link count of
      another inode that has a reference that conflicts with a new reference
      for the inode currently being processed.
      
      During log replay, at add_link(), we may drop (unlink) a reference from
      some inode in the subvolume tree if that reference conflicts with a new
      reference found in the log for the inode we are currently processing.
      
      After the unlink, if the link count has decreased from 1 to 0, then we
      increment the link count to prevent the inode from being deleted if it's
      evicted by an iput() call, because we may have references to add to that
      inode later on (and we will fixup its link count later during log replay).
      
      However incrementing the link count from 0 to 1 triggers a warning:
      
        $ cat fs/inode.c
        (...)
        void inc_nlink(struct inode *inode)
        {
              if (unlikely(inode->i_nlink == 0)) {
                       WARN_ON(!(inode->i_state & I_LINKABLE));
                       atomic_long_dec(&inode->i_sb->s_remove_count);
              }
        (...)
      
      The I_LINKABLE flag is only set when creating an O_TMPFILE file, so it's
      never set during log replay.
      
      Most of the time, the warning isn't triggered even if we dropped the last
      reference of the conflicting inode, and this is because:
      
      1) The conflicting inode was previously marked for fixup, through a call
         to link_to_fixup_dir(), which increments the inode's link count;
      
      2) And the last iput() on the inode has not triggered eviction of the
         inode, nor was eviction triggered after the iput(). So at add_link(),
         even if we unlink the last reference of the inode, its link count ends
         up being 1 and not 0.
      
      So this means that if eviction is triggered after link_to_fixup_dir() is
      called, at add_link() we will read the inode back from the subvolume tree
      and have it with a correct link count, matching the number of references
      it has on the subvolume tree. So if when we are at add_link() the inode
      has exactly one reference only, its link count is 1, and after the unlink
      its link count becomes 0.
      
      So fix this by using set_nlink() instead of inc_nlink(), as the former
      accepts a transition from 0 to 1 and it's what we use in other similar
      contexts (like at link_to_fixup_dir()).
      
      Also make add_inode_ref() use set_nlink() instead of inc_nlink() to
      bump the link count from 0 to 1.
      
      The warning is actually harmless, but it may scare users. Josef also ran
      into it recently.
      
      CC: stable@vger.kernel.org # 5.1+
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      769030e1
    • F
      btrfs: fix lost error handling when looking up extended ref on log replay · 7a6b75b7
      Filipe Manana 提交于
      During log replay, when processing inode references, if we get an error
      when looking up an extended reference at __add_inode_ref(), we ignore
      it and proceed, returning success (0) if no other error happens after the
      lookup. This is obviously wrong because in case an extended reference
      exists and it encodes some name not in the log, we need to unlink it,
      otherwise the filesystem state will not match the state it had after the
      last fsync.
      
      So just make __add_inode_ref() return an error it gets from the extended
      reference lookup.
      
      Fixes: f186373f ("btrfs: extended inode refs")
      CC: stable@vger.kernel.org # 4.9+
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      7a6b75b7
  6. 25 7月, 2022 3 次提交
    • F
      btrfs: join running log transaction when logging new name · 723df2bc
      Filipe Manana 提交于
      When logging a new name, in case of a rename, we pin the log before
      changing it. We then either delete a directory entry from the log or
      insert a key range item to mark the old name for deletion on log replay.
      
      However when doing one of those log changes we may have another task that
      started writing out the log (at btrfs_sync_log()) and it started before
      we pinned the log root. So we may end up changing a log tree while its
      writeback is being started by another task syncing the log. This can lead
      to inconsistencies in a log tree and other unexpected results during log
      replay, because we can get some committed node pointing to a node/leaf
      that ends up not getting written to disk before the next log commit.
      
      The problem, conceptually, started to happen in commit 88d2beec
      ("btrfs: avoid logging all directory changes during renames"), because
      there we started to update the log without joining its current transaction
      first.
      
      However the problem only became visible with commit 259c4b96
      ("btrfs: stop doing unnecessary log updates during a rename"), and that is
      because we used to pin the log at btrfs_rename() and then before entering
      btrfs_log_new_name(), when unlinking the old dentry, we ended up at
      btrfs_del_inode_ref_in_log() and btrfs_del_dir_entries_in_log(). Both
      of them join the current log transaction, effectively waiting for any log
      transaction writeout (due to acquiring the root's log_mutex). This made it
      safe even after leaving the current log transaction, because we remained
      with the log pinned when we called btrfs_log_new_name().
      
      Then in commit 259c4b96 ("btrfs: stop doing unnecessary log updates
      during a rename"), we removed the log pinning from btrfs_rename() and
      stopped calling btrfs_del_inode_ref_in_log() and
      btrfs_del_dir_entries_in_log() during the rename, and started to do all
      the needed work at btrfs_log_new_name(), but without joining the current
      log transaction, only pinning the log, which is racy because another task
      may have started writeout of the log tree right before we pinned the log.
      
      Both commits landed in kernel 5.18, so it doesn't make any practical
      difference which should be blamed, but I'm blaming the second commit only
      because with the first one, by chance, the problem did not happen due to
      the fact we joined the log transaction after pinning the log and unpinned
      it only after calling btrfs_log_new_name().
      
      So make btrfs_log_new_name() join the current log transaction instead of
      pinning it, so that we never do log updates if its writeout is starting.
      
      Fixes: 259c4b96 ("btrfs: stop doing unnecessary log updates during a rename")
      CC: stable@vger.kernel.org # 5.18+
      Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
      Tested-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
      Reviewed-by: Josef Bacik <josef@toxicpanda.com>
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      723df2bc
    • J
      btrfs: tree-log: make the return value for log syncing consistent · f31f09f6
      Josef Bacik 提交于
      Currently we will return 1 or -EAGAIN if we decide we need to commit
      the transaction rather than sync the log.  In practice this doesn't
      really matter, we interpret any !0 and !BTRFS_NO_LOG_SYNC as needing to
      commit the transaction.  However this makes it hard to figure out what
      the correct thing to do is.
      
      Fix this up by defining BTRFS_LOG_FORCE_COMMIT and using this in all the
      places where we want to force the transaction to be committed.
      
      CC: stable@vger.kernel.org # 5.15+
      Reviewed-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: Josef Bacik <josef@toxicpanda.com>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      f31f09f6
    • D
      btrfs: fix typos in comments · 143823cf
      David Sterba 提交于
      Codespell has found a few typos.
      Signed-off-by: David Sterba <dsterba@suse.com>
      143823cf
  7. 16 5月, 2022 2 次提交
  8. 06 5月, 2022 1 次提交
    • F
      btrfs: fix assertion failure when logging directory key range item · 750ee454
      Filipe Manana 提交于
      When inserting a key range item (BTRFS_DIR_LOG_INDEX_KEY) while logging
      a directory, we don't expect the insertion to fail with -EEXIST, because
      we are holding the directory's log_mutex and we have dropped all existing
      BTRFS_DIR_LOG_INDEX_KEY keys from the log tree before we started to log
      the directory. However it's possible that during the logging we attempt
      to insert the same BTRFS_DIR_LOG_INDEX_KEY key twice, but for this to
      happen we need to race with insertions of items from other inodes in the
      subvolume's tree while we are logging a directory. Here's how this can
      happen:
      
      1) We are logging a directory with inode number 1000 that has its items
         spread across 3 leaves in the subvolume's tree:
      
         leaf A - has index keys from the range 2 to 20 for example. The last
         item in the leaf corresponds to a dir item for index number 20. All
         these dir items were created in a past transaction.
      
         leaf B - has index keys from the range 22 to 100 for example. It has
         no keys from other inodes, all its keys are dir index keys for our
         directory inode number 1000. Its first key is for the dir item with
         a sequence number of 22. All these dir items were also created in a
         past transaction.
      
         leaf C - has index keys for our directory for the range 101 to 120 for
         example. This leaf also has items from other inodes, and its first
         item corresponds to the dir item for index number 101 for our directory
         with inode number 1000;
      
      2) When we finish processing the items from leaf A at log_dir_items(),
         we log a BTRFS_DIR_LOG_INDEX_KEY key with an offset of 21 and a last
         offset of 21, meaning the log is authoritative for the index range
         from 21 to 21 (a single sequence number). At this point leaf B was
         not yet modified in the current transaction;
      
      3) When we return from log_dir_items() we have released our read lock on
         leaf B, and have set *last_offset_ret to 21 (index number of the first
         item on leaf B minus 1);
      
      4) Some other task inserts an item for another inode (inode number 1001 for
         example) into leaf C. That resulted in pushing some items from leaf C
         into leaf B, in order to make room for the new item, so now leaf B
         has dir index keys for the sequence number range from 22 to 102 and
         leaf C has the dir items for the sequence number range 103 to 120;
      
      5) At log_directory_changes() we call log_dir_items() again, passing it
         a 'min_offset' / 'min_key' value of 22 (*last_offset_ret from step 3
         plus 1, so 21 + 1). Then btrfs_search_forward() leaves us at slot 0
         of leaf B, since leaf B was modified in the current transaction.
      
         We have also initialized 'last_old_dentry_offset' to 20 after calling
         btrfs_previous_item() at log_dir_items(), as it left us at the last
         item of leaf A, which refers to the dir item with sequence number 20;
      
      6) We then call process_dir_items_leaf() to process the dir items of
         leaf B, and when we process the first item, corresponding to slot 0,
         sequence number 22, we notice the dir item was created in a past
         transaction and its sequence number is greater than the value of
         *last_old_dentry_offset + 1 (20 + 1), so we decide to log again a
         BTRFS_DIR_LOG_INDEX_KEY key with an offset of 21 and an end range
         of 21 (key.offset - 1 == 22 - 1 == 21), which results in an -EEXIST
         error from insert_dir_log_key(), as we have already inserted that
         key at step 2, triggering the assertion at process_dir_items_leaf().
      
      The trace produced in dmesg is like the following:
      
      assertion failed: ret != -EEXIST, in fs/btrfs/tree-log.c:3857
      [198255.980839][ T7460] ------------[ cut here ]------------
      [198255.981666][ T7460] kernel BUG at fs/btrfs/ctree.h:3617!
      [198255.983141][ T7460] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
      [198255.984080][ T7460] CPU: 0 PID: 7460 Comm: repro-ghost-dir Not tainted 5.18.0-5314c78ac373-misc-next+
      [198255.986027][ T7460] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
      [198255.988600][ T7460] RIP: 0010:assertfail.constprop.0+0x1c/0x1e
      [198255.989465][ T7460] Code: 8b 4c 89 (...)
      [198255.992599][ T7460] RSP: 0018:ffffc90007387188 EFLAGS: 00010282
      [198255.993414][ T7460] RAX: 000000000000003d RBX: 0000000000000065 RCX: 0000000000000000
      [198255.996056][ T7460] RDX: 0000000000000001 RSI: ffffffff8b62b180 RDI: fffff52000e70e24
      [198255.997668][ T7460] RBP: ffffc90007387188 R08: 000000000000003d R09: ffff8881f0e16507
      [198255.999199][ T7460] R10: ffffed103e1c2ca0 R11: 0000000000000001 R12: 00000000ffffffef
      [198256.000683][ T7460] R13: ffff88813befc630 R14: ffff888116c16e70 R15: ffffc90007387358
      [198256.007082][ T7460] FS:  00007fc7f7c24640(0000) GS:ffff8881f0c00000(0000) knlGS:0000000000000000
      [198256.009939][ T7460] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
      [198256.014133][ T7460] CR2: 0000560bb16d0b78 CR3: 0000000140b34005 CR4: 0000000000170ef0
      [198256.015239][ T7460] Call Trace:
      [198256.015674][ T7460]  <TASK>
      [198256.016313][ T7460]  log_dir_items.cold+0x16/0x2c
      [198256.018858][ T7460]  ? replay_one_extent+0xbf0/0xbf0
      [198256.025932][ T7460]  ? release_extent_buffer+0x1d2/0x270
      [198256.029658][ T7460]  ? rcu_read_lock_sched_held+0x16/0x80
      [198256.031114][ T7460]  ? lock_acquired+0xbe/0x660
      [198256.032633][ T7460]  ? rcu_read_lock_sched_held+0x16/0x80
      [198256.034386][ T7460]  ? lock_release+0xcf/0x8a0
      [198256.036152][ T7460]  log_directory_changes+0xf9/0x170
      [198256.036993][ T7460]  ? log_dir_items+0xba0/0xba0
      [198256.037661][ T7460]  ? do_raw_write_unlock+0x7d/0xe0
      [198256.038680][ T7460]  btrfs_log_inode+0x233b/0x26d0
      [198256.041294][ T7460]  ? log_directory_changes+0x170/0x170
      [198256.042864][ T7460]  ? btrfs_attach_transaction_barrier+0x60/0x60
      [198256.045130][ T7460]  ? rcu_read_lock_sched_held+0x16/0x80
      [198256.046568][ T7460]  ? lock_release+0xcf/0x8a0
      [198256.047504][ T7460]  ? lock_downgrade+0x420/0x420
      [198256.048712][ T7460]  ? ilookup5_nowait+0x81/0xa0
      [198256.049747][ T7460]  ? lock_downgrade+0x420/0x420
      [198256.050652][ T7460]  ? do_raw_spin_unlock+0xa9/0x100
      [198256.051618][ T7460]  ? __might_resched+0x128/0x1c0
      [198256.052511][ T7460]  ? __might_sleep+0x66/0xc0
      [198256.053442][ T7460]  ? __kasan_check_read+0x11/0x20
      [198256.054251][ T7460]  ? iget5_locked+0xbd/0x150
      [198256.054986][ T7460]  ? run_delayed_iput_locked+0x110/0x110
      [198256.055929][ T7460]  ? btrfs_iget+0xc7/0x150
      [198256.056630][ T7460]  ? btrfs_orphan_cleanup+0x4a0/0x4a0
      [198256.057502][ T7460]  ? free_extent_buffer+0x13/0x20
      [198256.058322][ T7460]  btrfs_log_inode+0x2654/0x26d0
      [198256.059137][ T7460]  ? log_directory_changes+0x170/0x170
      [198256.060020][ T7460]  ? rcu_read_lock_sched_held+0x16/0x80
      [198256.060930][ T7460]  ? rcu_read_lock_sched_held+0x16/0x80
      [198256.061905][ T7460]  ? lock_contended+0x770/0x770
      [198256.062682][ T7460]  ? btrfs_log_inode_parent+0xd04/0x1750
      [198256.063582][ T7460]  ? lock_downgrade+0x420/0x420
      [198256.064432][ T7460]  ? preempt_count_sub+0x18/0xc0
      [198256.065550][ T7460]  ? __mutex_lock+0x580/0xdc0
      [198256.066654][ T7460]  ? stack_trace_save+0x94/0xc0
      [198256.068008][ T7460]  ? __kasan_check_write+0x14/0x20
      [198256.072149][ T7460]  ? __mutex_unlock_slowpath+0x12a/0x430
      [198256.073145][ T7460]  ? mutex_lock_io_nested+0xcd0/0xcd0
      [198256.074341][ T7460]  ? wait_for_completion_io_timeout+0x20/0x20
      [198256.075345][ T7460]  ? lock_downgrade+0x420/0x420
      [198256.076142][ T7460]  ? lock_contended+0x770/0x770
      [198256.076939][ T7460]  ? do_raw_spin_lock+0x1c0/0x1c0
      [198256.078401][ T7460]  ? btrfs_sync_file+0x5e6/0xa40
      [198256.080598][ T7460]  btrfs_log_inode_parent+0x523/0x1750
      [198256.081991][ T7460]  ? wait_current_trans+0xc8/0x240
      [198256.083320][ T7460]  ? lock_downgrade+0x420/0x420
      [198256.085450][ T7460]  ? btrfs_end_log_trans+0x70/0x70
      [198256.086362][ T7460]  ? rcu_read_lock_sched_held+0x16/0x80
      [198256.087544][ T7460]  ? lock_release+0xcf/0x8a0
      [198256.088305][ T7460]  ? lock_downgrade+0x420/0x420
      [198256.090375][ T7460]  ? dget_parent+0x8e/0x300
      [198256.093538][ T7460]  ? do_raw_spin_lock+0x1c0/0x1c0
      [198256.094918][ T7460]  ? lock_downgrade+0x420/0x420
      [198256.097815][ T7460]  ? do_raw_spin_unlock+0xa9/0x100
      [198256.101822][ T7460]  ? dget_parent+0xb7/0x300
      [198256.103345][ T7460]  btrfs_log_dentry_safe+0x48/0x60
      [198256.105052][ T7460]  btrfs_sync_file+0x629/0xa40
      [198256.106829][ T7460]  ? start_ordered_ops.constprop.0+0x120/0x120
      [198256.109655][ T7460]  ? __fget_files+0x161/0x230
      [198256.110760][ T7460]  vfs_fsync_range+0x6d/0x110
      [198256.111923][ T7460]  ? start_ordered_ops.constprop.0+0x120/0x120
      [198256.113556][ T7460]  __x64_sys_fsync+0x45/0x70
      [198256.114323][ T7460]  do_syscall_64+0x5c/0xc0
      [198256.115084][ T7460]  ? syscall_exit_to_user_mode+0x3b/0x50
      [198256.116030][ T7460]  ? do_syscall_64+0x69/0xc0
      [198256.116768][ T7460]  ? do_syscall_64+0x69/0xc0
      [198256.117555][ T7460]  ? do_syscall_64+0x69/0xc0
      [198256.118324][ T7460]  ? sysvec_call_function_single+0x57/0xc0
      [198256.119308][ T7460]  ? asm_sysvec_call_function_single+0xa/0x20
      [198256.120363][ T7460]  entry_SYSCALL_64_after_hwframe+0x44/0xae
      [198256.121334][ T7460] RIP: 0033:0x7fc7fe97b6ab
      [198256.122067][ T7460] Code: 0f 05 48 (...)
      [198256.125198][ T7460] RSP: 002b:00007fc7f7c23950 EFLAGS: 00000293 ORIG_RAX: 000000000000004a
      [198256.126568][ T7460] RAX: ffffffffffffffda RBX: 00007fc7f7c239f0 RCX: 00007fc7fe97b6ab
      [198256.127942][ T7460] RDX: 0000000000000002 RSI: 000056167536bcf0 RDI: 0000000000000004
      [198256.129302][ T7460] RBP: 0000000000000004 R08: 0000000000000000 R09: 000000007ffffeb8
      [198256.130670][ T7460] R10: 00000000000001ff R11: 0000000000000293 R12: 0000000000000001
      [198256.132046][ T7460] R13: 0000561674ca8140 R14: 00007fc7f7c239d0 R15: 000056167536dab8
      [198256.133403][ T7460]  </TASK>
      
      Fix this by treating -EEXIST as expected at insert_dir_log_key() and have
      it update the item with an end offset corresponding to the maximum between
      the previously logged end offset and the new requested end offset. The end
      offsets may be different due to dir index key deletions that happened as
      part of unlink operations while we are logging a directory (triggered when
      fsyncing some other inode parented by the directory) or during renames
      which always attempt to log a single dir index deletion.
      Reported-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org>
      Link: https://lore.kernel.org/linux-btrfs/YmyefE9mc2xl5ZMz@hungrycats.org/
      Fixes: 732d591a ("btrfs: stop copying old dir items when logging a directory")
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      750ee454
  9. 28 4月, 2022 1 次提交
    • F
      btrfs: always log symlinks in full mode · d0e64a98
      Filipe Manana 提交于
      On Linux, empty symlinks are invalid, and attempting to create one with
      the system call symlink(2) results in an -ENOENT error and this is
      explicitly documented in the man page.
      
      If we rename a symlink that was created in the current transaction and its
      parent directory was logged before, we actually end up logging the symlink
      without logging its content, which is stored in an inline extent. That
      means that after a power failure we can end up with an empty symlink,
      having no content and an i_size of 0 bytes.
      
      It can be easily reproduced like this:
      
        $ mkfs.btrfs -f /dev/sdc
        $ mount /dev/sdc /mnt
      
        $ mkdir /mnt/testdir
        $ sync
      
        # Create a file inside the directory and fsync the directory.
        $ touch /mnt/testdir/foo
        $ xfs_io -c "fsync" /mnt/testdir
      
        # Create a symlink inside the directory and then rename the symlink.
        $ ln -s /mnt/testdir/foo /mnt/testdir/bar
        $ mv /mnt/testdir/bar /mnt/testdir/baz
      
        # Now fsync the directory again, this persists the log tree.
        $ xfs_io -c "fsync" /mnt/testdir
      
        <power failure>
      
        $ mount /dev/sdc /mnt
        $ stat -c %s /mnt/testdir/baz
        0
        $ readlink /mnt/testdir/baz
        $
      
      Fix this by always logging symlinks in full mode (LOG_INODE_ALL), so that
      their content is also logged.
      
      A test case for fstests will follow.
      
      CC: stable@vger.kernel.org # 4.9+
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      d0e64a98
  10. 19 4月, 2022 1 次提交
  11. 14 3月, 2022 5 次提交
    • F
      btrfs: add and use helper for unlinking inode during log replay · 313ab753
      Filipe Manana 提交于
      During log replay there is this pattern of running delayed items after
      every inode unlink. To avoid repeating this several times, move the
      logic into a helper function and use it instead of calling
      btrfs_unlink_inode() followed by btrfs_run_delayed_items().
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      313ab753
    • F
      btrfs: reset last_reflink_trans after fsyncing inode · 23e3337f
      Filipe Manana 提交于
      When an inode has a last_reflink_trans matching the current transaction,
      we have to take special care when logging its checksums in order to
      avoid getting checksum items with overlapping ranges in a log tree,
      which could result in missing checksums after log replay (more on that
      in the changelogs of commit 40e046ac ("Btrfs: fix missing data
      checksums after replaying a log tree") and commit e289f03e ("btrfs:
      fix corrupt log due to concurrent fsync of inodes with shared extents")).
      We also need to make sure a full fsync will copy all old file extent
      items it finds in modified leaves, because they might have been copied
      from some other inode.
      
      However once we fsync an inode, we don't need to keep paying the price of
      that extra special care in future fsyncs done in the same transaction,
      unless the inode is used for another reflink operation or the full sync
      flag is set on it (truncate, failure to allocate extent maps for holes,
      and other exceptional and infrequent cases).
      
      So after we fsync an inode, reset its last_reflink_trans to zero. In case
      another reflink happens, we continue to update the last_reflink_trans of
      the inode, just as before. Also set last_reflink_trans to the generation
      of the last transaction that modified the inode whenever we need to set
      the full sync flag on the inode, just like when we need to load an inode
      from disk after eviction.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      23e3337f
    • F
      btrfs: voluntarily relinquish cpu when doing a full fsync · 96acb375
      Filipe Manana 提交于
      Doing a full fsync may require processing many leaves of metadata, which
      can take some time and result in a task monopolizing a cpu for too long.
      So add a cond_resched() after processing a leaf when doing a full fsync,
      while not holding any locks on any tree (a subvolume or a log tree).
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      96acb375
    • F
      btrfs: hold on to less memory when logging checksums during full fsync · 5b7ce5e2
      Filipe Manana 提交于
      When doing a full fsync, at copy_items(), we iterate over all extents and
      then collect their checksums into a list. After copying all the extents to
      the log tree, we then log all the previously collected checksums.
      
      Before the previous patch in the series (subject "btrfs: stop copying old
      file extents when doing a full fsync"), we had to do it this way, because
      while we were iterating over the items in the leaf of the subvolume tree,
      we were holding a write lock on a leaf of the log tree, so logging the
      checksums for an extent right after we collected them could result in a
      deadlock, in case the checksum items ended up in the same leaf.
      
      However after the previous patch in the series we now do a first iteration
      over all the items in the leaf of the subvolume tree before locking a path
      in the log tree, so we can now log the checksums right after we have
      obtained them. This avoids holding in memory all checksums for all extents
      in the leaf while copying items from the source leaf to the log tree. The
      amount of memory used to hold all checksums of the extents in a leaf can
      be significant. For example if a leaf has 200 file extent items referring
      to 1M extents, using the default crc32c checksums, would result in using
      over 200K of memory (not accounting for the extra overhead of struct
      btrfs_ordered_sum), with smaller or fewer extents it would be less, but
      it could be much more with more extents per leaf and/or much larger
      extents.
      
      So change copy_items() to log the checksums for an extent after looking
      them up, and then free their memory, as they are no longer necessary.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      5b7ce5e2
    • F
      btrfs: stop copying old file extents when doing a full fsync · 7f30c072
      Filipe Manana 提交于
      When logging an inode in full sync mode, we go over every leaf that was
      modified in the current transaction and has items associated to our inode,
      and then copy all those items into the log tree. This includes copying
      file extent items that were created and added to the inode in past
      transactions, which is useless and only makes us use more leaf space in the
      log tree.
      
      It's common to have a file with many file extent items spanning many
      leaves where only a few file extent items are new and need to be logged,
      and in such case we log all the file extent items we find in the modified
      leaves.
      
      So change the full sync behaviour to skip over file extent items that are
      not needed. Those are the ones that match the following criteria:
      
      1) Have a generation older than the current transaction and the inode
         was not a target of a reflink operation, as that can copy file extent
         items from a past generation from some other inode into our inode, so
         we have to log them;
      
      2) Start at an offset within i_size - we must log anything at or beyond
         i_size, otherwise we would lose prealloc extents after log replay.
      
      The following script exercises a scenario where this happens, and it's
      somewhat close to what often happened on a SQL Server workload which
      I had to debug some time ago to fix an issue where a pattern of writes to
      prealloc extents and fsync resulted in fsync failing with -EIO (that was
      commit ea7036de ("btrfs: fix fsync failure and transaction abort
      after writes to prealloc extents")). In that particular case, we had large
      files that had random writes and were often truncated, which made the
      next fsync be a full sync.
      
        $ cat test.sh
        #!/bin/bash
      
        DEV=/dev/sdi
        MNT=/mnt/sdi
      
        MKFS_OPTIONS="-O no-holes -R free-space-tree"
        MOUNT_OPTIONS="-o ssd"
      
        FILE_SIZE=$((1 * 1024 * 1024 * 1024)) # 1G
        # FILE_SIZE=$((2 * 1024 * 1024 * 1024)) # 2G
        # FILE_SIZE=$((512 * 1024 * 1024)) # 512M
      
        mkfs.btrfs -f $MKFS_OPTIONS $DEV
        mount $MOUNT_OPTIONS $DEV $MNT
      
        # Create a file with many extents. Use direct IO to make it faster
        # to create the file - using buffered IO we would have to fsync
        # after each write (terribly slow).
        echo "Creating file with $((FILE_SIZE / 4096)) extents of 4K each..."
        xfs_io -f -d -c "pwrite -b 4K 0 $FILE_SIZE" $MNT/foobar
      
        # Commit the transaction, so every extent after this is from an
        # old generation.
        sync
      
        # Now rewrite only a few extents, which are all far spread apart from
        # each other (e.g. 1G / 32M = 32 extents).
        # After this only a few extents have a new generation, while all other
        # ones have an old generation.
        echo "Rewriting $((FILE_SIZE / (32 * 1024 * 1024))) extents..."
        for ((i = 0; i < $FILE_SIZE; i += $((32 * 1024 * 1024)))); do
            xfs_io -c "pwrite $i 4K" $MNT/foobar >/dev/null
        done
      
        # Fsync, the inode is logged in full sync mode since it was never fsynced
        # before.
        echo "Fsyncing file..."
        xfs_io -c "fsync" $MNT/foobar
      
        umount $MNT
      
      And the following bpftrace program was running when executing the test
      script:
      
        $ cat bpf-script.sh
        #!/usr/bin/bpftrace
      
        k:btrfs_log_inode
        {
            @start_log_inode[tid] = nsecs;
        }
      
        kr:btrfs_log_inode
        /@start_log_inode[tid]/
        {
            @log_inode_dur[tid] = (nsecs - @start_log_inode[tid]) / 1000;
            delete(@start_log_inode[tid]);
        }
      
        k:btrfs_sync_log
        {
            @start_sync_log[tid] = nsecs;
        }
      
        kr:btrfs_sync_log
        /@start_sync_log[tid]/
        {
            $sync_log_dur = (nsecs - @start_sync_log[tid]) / 1000;
            printf("btrfs_log_inode() took %llu us\n", @log_inode_dur[tid]);
            printf("btrfs_sync_log()  took %llu us\n", $sync_log_dur);
            delete(@start_sync_log[tid]);
            delete(@log_inode_dur[tid]);
            exit();
        }
      
      With 512M test file, before this patch:
      
        btrfs_log_inode() took 15218 us
        btrfs_sync_log()  took 1328 us
      
        Log tree has 17 leaves and 1 node, its total size is 294912 bytes.
      
      With 512M test file, after this patch:
      
        btrfs_log_inode() took 14760 us
        btrfs_sync_log()  took 588 us
      
        Log tree has a single leaf, its total size is 16K.
      
      With 1G test file, before this patch:
      
        btrfs_log_inode() took 27301 us
        btrfs_sync_log()  took 1767 us
      
        Log tree has 33 leaves and 1 node, its total size is 557056 bytes.
      
      With 1G test file, after this patch:
      
        btrfs_log_inode() took 26166 us
        btrfs_sync_log()  took 593 us
      
        Log tree has a single leaf, its total size is 16K.
      
      With 2G test file, before this patch:
      
        btrfs_log_inode() took 50892 us
        btrfs_sync_log()  took 3127 us
      
        Log tree has 65 leaves and 1 node, its total size is 1081344 bytes.
      
      With 2G test file, after this patch:
      
        btrfs_log_inode() took 50126 us
        btrfs_sync_log()  took 586 us
      
        Log tree has a single leaf, its total size is 16K.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      7f30c072