1. 06 12月, 2016 3 次提交
  2. 30 11月, 2016 2 次提交
  3. 29 11月, 2016 1 次提交
  4. 11 10月, 2016 1 次提交
  5. 27 9月, 2016 5 次提交
    • A
      btrfs: fix a possible umount deadlock · 0ccd0528
      Anand Jain 提交于
      btrfs_show_devname() takes the device_list_mutex, and a call to
      blkdev_put() can sometimes lead the VFS to call into this function.
      So, for now, call blkdev_put() outside of device_list_mutex.
      
      [  983.284212] ======================================================
      [  983.290401] [ INFO: possible circular locking dependency detected ]
      [  983.296677] 4.8.0-rc5-ceph-00023-g1b39cec2 #1 Not tainted
      [  983.302081] -------------------------------------------------------
      [  983.308357] umount/21720 is trying to acquire lock:
      [  983.313243]  (&bdev->bd_mutex){+.+.+.}, at: [<ffffffff9128ec51>] blkdev_put+0x31/0x150
      [  983.321264]
      [  983.321264] but task is already holding lock:
      [  983.327101]  (&fs_devs->device_list_mutex){+.+...}, at: [<ffffffffc033d6f6>] __btrfs_close_devices+0x46/0x200 [btrfs]
      [  983.337839]
      [  983.337839] which lock already depends on the new lock.
      [  983.337839]
      [  983.346024]
      [  983.346024] the existing dependency chain (in reverse order) is:
      [  983.353512]
      -> #4 (&fs_devs->device_list_mutex){+.+...}:
      [  983.359096]        [<ffffffff910dfd0c>] lock_acquire+0x1bc/0x1f0
      [  983.365143]        [<ffffffff91823125>] mutex_lock_nested+0x65/0x350
      [  983.371521]        [<ffffffffc02d8116>] btrfs_show_devname+0x36/0x1f0 [btrfs]
      [  983.378710]        [<ffffffff9129523e>] show_vfsmnt+0x4e/0x150
      [  983.384593]        [<ffffffff9126ffc7>] m_show+0x17/0x20
      [  983.389957]        [<ffffffff91276405>] seq_read+0x2b5/0x3b0
      [  983.395669]        [<ffffffff9124c808>] __vfs_read+0x28/0x100
      [  983.401464]        [<ffffffff9124eb3b>] vfs_read+0xab/0x150
      [  983.407080]        [<ffffffff9124ec32>] SyS_read+0x52/0xb0
      [  983.412609]        [<ffffffff91825fc0>] entry_SYSCALL_64_fastpath+0x23/0xc1
      [  983.419617]
      -> #3 (namespace_sem){++++++}:
      [  983.424024]        [<ffffffff910dfd0c>] lock_acquire+0x1bc/0x1f0
      [  983.430074]        [<ffffffff918239e9>] down_write+0x49/0x80
      [  983.435785]        [<ffffffff91272457>] lock_mount+0x67/0x1c0
      [  983.441582]        [<ffffffff91272ab2>] do_add_mount+0x32/0xf0
      [  983.447458]        [<ffffffff9127363a>] finish_automount+0x5a/0xc0
      [  983.453682]        [<ffffffff91259513>] follow_managed+0x1b3/0x2a0
      [  983.459912]        [<ffffffff9125b750>] lookup_fast+0x300/0x350
      [  983.465875]        [<ffffffff9125d6e7>] path_openat+0x3a7/0xaa0
      [  983.471846]        [<ffffffff9125ef75>] do_filp_open+0x85/0xe0
      [  983.477731]        [<ffffffff9124c41c>] do_sys_open+0x14c/0x1f0
      [  983.483702]        [<ffffffff9124c4de>] SyS_open+0x1e/0x20
      [  983.489240]        [<ffffffff91825fc0>] entry_SYSCALL_64_fastpath+0x23/0xc1
      [  983.496254]
      -> #2 (&sb->s_type->i_mutex_key#3){+.+.+.}:
      [  983.501798]        [<ffffffff910dfd0c>] lock_acquire+0x1bc/0x1f0
      [  983.507855]        [<ffffffff918239e9>] down_write+0x49/0x80
      [  983.513558]        [<ffffffff91366237>] start_creating+0x87/0x100
      [  983.519703]        [<ffffffff91366647>] debugfs_create_dir+0x17/0x100
      [  983.526195]        [<ffffffff911df153>] bdi_register+0x93/0x210
      [  983.532165]        [<ffffffff911df313>] bdi_register_owner+0x43/0x70
      [  983.538570]        [<ffffffff914080fb>] device_add_disk+0x1fb/0x450
      [  983.544888]        [<ffffffff91580226>] loop_add+0x1e6/0x290
      [  983.550596]        [<ffffffff91fec358>] loop_init+0x10b/0x14f
      [  983.556394]        [<ffffffff91002207>] do_one_initcall+0xa7/0x180
      [  983.562618]        [<ffffffff91f932e0>] kernel_init_freeable+0x1cc/0x266
      [  983.569370]        [<ffffffff918174be>] kernel_init+0xe/0x100
      [  983.575166]        [<ffffffff9182620f>] ret_from_fork+0x1f/0x40
      [  983.581131]
      -> #1 (loop_index_mutex){+.+.+.}:
      [  983.585801]        [<ffffffff910dfd0c>] lock_acquire+0x1bc/0x1f0
      [  983.591858]        [<ffffffff91823125>] mutex_lock_nested+0x65/0x350
      [  983.598256]        [<ffffffff9157ed3f>] lo_open+0x1f/0x60
      [  983.603704]        [<ffffffff9128eec3>] __blkdev_get+0x123/0x400
      [  983.609757]        [<ffffffff9128f4ea>] blkdev_get+0x34a/0x350
      [  983.615639]        [<ffffffff9128f554>] blkdev_open+0x64/0x80
      [  983.621428]        [<ffffffff9124aff6>] do_dentry_open+0x1c6/0x2d0
      [  983.627651]        [<ffffffff9124c029>] vfs_open+0x69/0x80
      [  983.633181]        [<ffffffff9125db74>] path_openat+0x834/0xaa0
      [  983.639152]        [<ffffffff9125ef75>] do_filp_open+0x85/0xe0
      [  983.645035]        [<ffffffff9124c41c>] do_sys_open+0x14c/0x1f0
      [  983.650999]        [<ffffffff9124c4de>] SyS_open+0x1e/0x20
      [  983.656535]        [<ffffffff91825fc0>] entry_SYSCALL_64_fastpath+0x23/0xc1
      [  983.663541]
      -> #0 (&bdev->bd_mutex){+.+.+.}:
      [  983.668107]        [<ffffffff910def43>] __lock_acquire+0x1003/0x17b0
      [  983.674510]        [<ffffffff910dfd0c>] lock_acquire+0x1bc/0x1f0
      [  983.680561]        [<ffffffff91823125>] mutex_lock_nested+0x65/0x350
      [  983.686967]        [<ffffffff9128ec51>] blkdev_put+0x31/0x150
      [  983.692761]        [<ffffffffc033481f>] btrfs_close_bdev+0x4f/0x60 [btrfs]
      [  983.699699]        [<ffffffffc033d77b>] __btrfs_close_devices+0xcb/0x200 [btrfs]
      [  983.707178]        [<ffffffffc033d8db>] btrfs_close_devices+0x2b/0xa0 [btrfs]
      [  983.714380]        [<ffffffffc03081c5>] close_ctree+0x265/0x340 [btrfs]
      [  983.721061]        [<ffffffffc02d7959>] btrfs_put_super+0x19/0x20 [btrfs]
      [  983.727908]        [<ffffffff91250e2f>] generic_shutdown_super+0x6f/0x100
      [  983.734744]        [<ffffffff91250f56>] kill_anon_super+0x16/0x30
      [  983.740888]        [<ffffffffc02da97e>] btrfs_kill_super+0x1e/0x130 [btrfs]
      [  983.747909]        [<ffffffff91250fe9>] deactivate_locked_super+0x49/0x80
      [  983.754745]        [<ffffffff912515fd>] deactivate_super+0x5d/0x70
      [  983.760977]        [<ffffffff91270a1c>] cleanup_mnt+0x5c/0x80
      [  983.766773]        [<ffffffff91270a92>] __cleanup_mnt+0x12/0x20
      [  983.772738]        [<ffffffff910aa2fe>] task_work_run+0x7e/0xc0
      [  983.778708]        [<ffffffff91081b5a>] exit_to_usermode_loop+0x7e/0xb4
      [  983.785373]        [<ffffffff910039eb>] syscall_return_slowpath+0xbb/0xd0
      [  983.792212]        [<ffffffff9182605c>] entry_SYSCALL_64_fastpath+0xbf/0xc1
      [  983.799225]
      [  983.799225] other info that might help us debug this:
      [  983.799225]
      [  983.807291] Chain exists of:
        &bdev->bd_mutex --> namespace_sem --> &fs_devs->device_list_mutex
      
      [  983.816521]  Possible unsafe locking scenario:
      [  983.816521]
      [  983.822489]        CPU0                    CPU1
      [  983.827043]        ----                    ----
      [  983.831599]   lock(&fs_devs->device_list_mutex);
      [  983.836289]                                lock(namespace_sem);
      [  983.842268]                                lock(&fs_devs->device_list_mutex);
      [  983.849478]   lock(&bdev->bd_mutex);
      [  983.853127]
      [  983.853127]  *** DEADLOCK ***
      [  983.853127]
      [  983.859113] 3 locks held by umount/21720:
      [  983.863145]  #0:  (&type->s_umount_key#35){++++..}, at: [<ffffffff912515f5>] deactivate_super+0x55/0x70
      [  983.872713]  #1:  (uuid_mutex){+.+.+.}, at: [<ffffffffc033d8d3>] btrfs_close_devices+0x23/0xa0 [btrfs]
      [  983.882206]  #2:  (&fs_devs->device_list_mutex){+.+...}, at: [<ffffffffc033d6f6>] __btrfs_close_devices+0x46/0x200 [btrfs]
      [  983.893422]
      [  983.893422] stack backtrace:
      [  983.897824] CPU: 6 PID: 21720 Comm: umount Not tainted 4.8.0-rc5-ceph-00023-g1b39cec2 #1
      [  983.905958] Hardware name: Supermicro SYS-5018R-WR/X10SRW-F, BIOS 1.0c 09/07/2015
      [  983.913492]  0000000000000000 ffff8c8a53c17a38 ffffffff91429521 ffffffff9260f4f0
      [  983.921018]  ffffffff92642760 ffff8c8a53c17a88 ffffffff911b2b04 0000000000000050
      [  983.928542]  ffffffff9237d620 ffff8c8a5294aee0 ffff8c8a5294aeb8 ffff8c8a5294aee0
      [  983.936072] Call Trace:
      [  983.938545]  [<ffffffff91429521>] dump_stack+0x85/0xc4
      [  983.943715]  [<ffffffff911b2b04>] print_circular_bug+0x1fb/0x20c
      [  983.949748]  [<ffffffff910def43>] __lock_acquire+0x1003/0x17b0
      [  983.955613]  [<ffffffff910dfd0c>] lock_acquire+0x1bc/0x1f0
      [  983.961123]  [<ffffffff9128ec51>] ? blkdev_put+0x31/0x150
      [  983.966550]  [<ffffffff91823125>] mutex_lock_nested+0x65/0x350
      [  983.972407]  [<ffffffff9128ec51>] ? blkdev_put+0x31/0x150
      [  983.977832]  [<ffffffff9128ec51>] blkdev_put+0x31/0x150
      [  983.983101]  [<ffffffffc033481f>] btrfs_close_bdev+0x4f/0x60 [btrfs]
      [  983.989500]  [<ffffffffc033d77b>] __btrfs_close_devices+0xcb/0x200 [btrfs]
      [  983.996415]  [<ffffffffc033d8db>] btrfs_close_devices+0x2b/0xa0 [btrfs]
      [  984.003068]  [<ffffffffc03081c5>] close_ctree+0x265/0x340 [btrfs]
      [  984.009189]  [<ffffffff9126cc5e>] ? evict_inodes+0x15e/0x170
      [  984.014881]  [<ffffffffc02d7959>] btrfs_put_super+0x19/0x20 [btrfs]
      [  984.021176]  [<ffffffff91250e2f>] generic_shutdown_super+0x6f/0x100
      [  984.027476]  [<ffffffff91250f56>] kill_anon_super+0x16/0x30
      [  984.033082]  [<ffffffffc02da97e>] btrfs_kill_super+0x1e/0x130 [btrfs]
      [  984.039548]  [<ffffffff91250fe9>] deactivate_locked_super+0x49/0x80
      [  984.045839]  [<ffffffff912515fd>] deactivate_super+0x5d/0x70
      [  984.051525]  [<ffffffff91270a1c>] cleanup_mnt+0x5c/0x80
      [  984.056774]  [<ffffffff91270a92>] __cleanup_mnt+0x12/0x20
      [  984.062201]  [<ffffffff910aa2fe>] task_work_run+0x7e/0xc0
      [  984.067625]  [<ffffffff91081b5a>] exit_to_usermode_loop+0x7e/0xb4
      [  984.073747]  [<ffffffff910039eb>] syscall_return_slowpath+0xbb/0xd0
      [  984.080038]  [<ffffffff9182605c>] entry_SYSCALL_64_fastpath+0xbf/0xc1
      Reported-by: Ilya Dryomov <idryomov@gmail.com>
      Signed-off-by: Anand Jain <anand.jain@oracle.com>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      0ccd0528
    • J
      btrfs: convert pr_* to btrfs_* where possible · ab8d0fc4
      Jeff Mahoney 提交于
      For many printks, we want to know which file system issued the message.
      
      This patch converts most pr_* calls to use the btrfs_* versions instead.
      In some cases, this means adding plumbing to allow call sites access to
      an fs_info pointer.
      
      fs/btrfs/check-integrity.c is left alone for another day.
      Signed-off-by: Jeff Mahoney <jeffm@suse.com>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      ab8d0fc4
    • J
      btrfs: convert printk(KERN_* to use pr_* calls · 62e85577
      Jeff Mahoney 提交于
      This patch converts printk(KERN_* style messages to use the pr_* versions.
      
      One side effect is that anything that was KERN_DEBUG is now automatically
      a dynamic debug message.
      Signed-off-by: Jeff Mahoney <jeffm@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      62e85577
    • J
      btrfs: unsplit printed strings · 5d163e0e
      Jeff Mahoney 提交于
      CodingStyle chapter 2:
      "[...] never break user-visible strings such as printk messages,
      because that breaks the ability to grep for them."
      
      This patch unsplits user-visible strings.
      Signed-off-by: Jeff Mahoney <jeffm@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      5d163e0e
    • J
      btrfs: clean the old superblocks before freeing the device · cea67ab9
      Jeff Mahoney 提交于
      btrfs_rm_device frees the block device but then re-opens it using
      the saved device name.  A race exists between the close and the
      re-open that allows the block size to be changed.  The result
      is getting stuck forever in the reclaim loop in __getblk_slow.
      
      This patch moves the superblock cleanup before closing the block
      device, which is also consistent with other callers.  We also don't
      need a private copy of dev_name as the whole routine operates under
      the uuid_mutex.
      Signed-off-by: Jeff Mahoney <jeffm@suse.com>
      Reviewed-by: David Sterba <dsterba@suse.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      cea67ab9
  6. 26 9月, 2016 2 次提交
    • J
      Btrfs: add a flags field to btrfs_fs_info · afcdd129
      Josef Bacik 提交于
      We have a lot of random ints in btrfs_fs_info that can be put into flags.  This
      is mostly equivalent with the exception of how we deal with quota going on or
      off, now instead we set a flag when we are turning it on or off and deal with
      that appropriately, rather than just having a pending state that the current
      quota_enabled gets set to.  Thanks,
      Signed-off-by: Josef Bacik <jbacik@fb.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      afcdd129
    • N
      btrfs: let btrfs_delete_unused_bgs() to clean relocated bgs · 5d8eb6fe
      Naohiro Aota 提交于
      Currently, btrfs_relocate_chunk() removes the relocated BG by itself. But
      the work can be done by btrfs_delete_unused_bgs() (and that is better, since
      it trims the BG). Let's dedupe the code.
      
      While btrfs_delete_unused_bgs() already hits the relocated BG, it
      skips the BG since the BG has the "ro" flag set (to keep balancing BG intact).
      On the other hand, btrfs cannot drop the "ro" flag here, to prevent additional
      writes. So this patch makes use of the "removed" flag.
      btrfs_delete_unused_bgs() now detects the flag to distinguish whether a
      read-only BG is relocating or not.
      Signed-off-by: Naohiro Aota <naohiro.aota@hgst.com>
      Reviewed-by: Josef Bacik <jbacik@fb.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      5d8eb6fe
  7. 25 8月, 2016 1 次提交
  8. 08 8月, 2016 1 次提交
    • J
      block: rename bio bi_rw to bi_opf · 1eff9d32
      Jens Axboe 提交于
      Since commit 63a4cc24, bio->bi_rw contains flags in the lower
      portion and the op code in the higher portions. This means that
      old code that relies on manually setting bi_rw is most likely
      going to be broken. Instead of letting that brokenness linger,
      rename the member, to force old and out-of-tree code to break
      at compile time instead of at runtime.
      
      No intended functional changes in this commit.
      Signed-off-by: Jens Axboe <axboe@fb.com>
      1eff9d32
  9. 26 7月, 2016 7 次提交
  10. 19 7月, 2016 1 次提交
  11. 24 6月, 2016 1 次提交
  12. 18 6月, 2016 1 次提交
  13. 08 6月, 2016 4 次提交
  14. 06 6月, 2016 3 次提交
  15. 01 6月, 2016 1 次提交
  16. 30 5月, 2016 2 次提交
    • F
      Btrfs: fix race between device replace and chunk allocation · 22ab04e8
      Filipe Manana 提交于
      While iterating and copying extents from the source device, the device
      replace code keeps adjusting a left cursor that is used to make sure that
      once we finish processing a device extent, any future writes to extents
      from the corresponding block group will get into both the source and
      target devices. This left cursor is also used for resuming the device
      replace operation at mount time.
      
      However using this left cursor to decide whether writes go into both
      devices or only the source device is not enough to guarantee we don't
      miss copying extents into the target device. There are two cases where
      the current approach fails. The first one is related to when there are
      holes in the device and they get allocated for new block groups while
      the device replace operation is iterating the device extents (more on
      this explained below). The second one is that when that loop over the
      device extents finishes, we start delalloc, wait for all ordered extents
      and then commit the current transaction, we might have got new block
      groups allocated that are now using a device extent that has an offset
      greater than or equal to the value of the left cursor, in which case
      writes to extents belonging to these new block groups will get issued
      only to the source device.
      
      For the first case where the current approach of using a left cursor
      fails, consider the source device currently has the following layout:
      
        [ extent bg A ] [ hole, unallocated space ] [extent bg B ]
        3Gb             4Gb                         5Gb
      
      While we are iterating the device extents from the source device using
      the commit root of the device tree, the following happens:
      
              CPU 1                                            CPU 2
      
                            <we are at transaction N>
      
        scrub_enumerate_chunks()
          --> searches the device tree for
              extents belonging to the source
              device using the device tree's
              commit root
          --> 1st iteration finds extent belonging to
              block group A
      
              --> sets block group A to RO mode
                  (btrfs_inc_block_group_ro)
      
              --> sets cursor left to found_key.offset
                  which is 3Gb
      
              --> scrub_chunk() starts
                  copies all allocated extents from
                  block group's A stripe at source
                  device into target device
      
                                                                 btrfs_alloc_chunk()
                                                                   --> allocates device extent
                                                                       in the range [4Gb, 5Gb[
                                                                       from the source device for
                                                                       a new block group C
      
                                                                 extent allocated from block
                                                                 group C for a direct IO,
                                                                 buffered write or btree node/leaf
      
                                                                 extent is written to, perhaps
                                                                 in response to a writepages()
                                                                 call from the VM or directly
                                                                 through direct IO
      
                                                                 the write is made only against
                                                                 the source device and not against
                                                                 the target device because the
                                                                 extent's offset is in the interval
                                                                 [4Gb, 5Gb[ which is larger than
                                                                 the value of cursor_left (3Gb)
      
              --> scrub_chunk() finishes
      
              --> updates left cursor from 3Gb to
                  4Gb
      
              --> btrfs_dec_block_group_ro() sets
                  block group A back to RW mode
      
                                   <we are still at transaction N>
      
          --> 2nd iteration finds extent belonging to
              block group B - it did not find the new
              extent in the range [4Gb, 5Gb[ for block
              group C because we are using the device
              tree's commit root or even because the
              block group's items are not all yet
              inserted in the respective btrees, that is,
              the block group is still attached to some
              transaction handle's new_bgs list and
              btrfs_create_pending_block_groups() was
              not called yet against that transaction
              handle, so the device extent items were
              not yet inserted into the devices tree
      
                                   <we are still at transaction N>
      
              --> so we end up not copying anything from the newly
                  allocated device extent from the source device
                  to the target device
      
      So fix this by making __btrfs_map_block() always redirect writes to the
      target device as well, independently of the left cursor's value. With
      this change the left cursor is now used only for the purpose of tracking
      progress and allow a mount operation to resume a device replace.
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Reviewed-by: Josef Bacik <jbacik@fb.com>
      22ab04e8
    • F
      Btrfs: fix race between device replace and block group removal · 57ba4cb8
      Filipe Manana 提交于
      When it's finishing, the device replace code iterates all extent maps
      representing block groups and for each one that has a stripe that refers
      to the source device, it replaces its device with the target device.
      However, when it replaces the source device with the target device,
      the target device still has an ID of 0ULL (BTRFS_DEV_REPLACE_DEVID);
      only afterwards is its ID changed to match the one from the source device.
      This leads to races with the chunk removal code that can temporarily see
      a device with an ID of 0ULL and then attempt to use that ID to remove
      items from the device tree and fail, causing a transaction abort:
      
      [ 9238.594364] BTRFS info (device sdf): dev_replace from /dev/sdf (devid 3) to /dev/sde finished
      [ 9238.594377] ------------[ cut here ]------------
      [ 9238.594402] WARNING: CPU: 14 PID: 21566 at fs/btrfs/volumes.c:2771 btrfs_remove_chunk+0x2e5/0x793 [btrfs]
      [ 9238.594403] BTRFS: Transaction aborted (error 1)
      [ 9238.594416] Modules linked in: btrfs crc32c_generic acpi_cpufreq xor tpm_tis tpm raid6_pq ppdev parport_pc processor psmouse parport i2c_piix4 evdev sg i2c_core serio_raw pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache sr_mod cdrom sd_mod ata_generic virtio_scsi ata_piix virtio_pci libata virtio_ring virtio e1000 scsi_mod floppy [last unloaded: btrfs]
      [ 9238.594418] CPU: 14 PID: 21566 Comm: btrfs-cleaner Not tainted 4.6.0-rc7-btrfs-next-29+ #1
      [ 9238.594419] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014
      [ 9238.594421]  0000000000000000 ffff88017f1dbc60 ffffffff8126b42c ffff88017f1dbcb0
      [ 9238.594422]  0000000000000000 ffff88017f1dbca0 ffffffff81052b14 00000ad37f1dbd18
      [ 9238.594423]  0000000000000001 ffff88018068a558 ffff88005c4b9c00 ffff880233f60db0
      [ 9238.594424] Call Trace:
      [ 9238.594428]  [<ffffffff8126b42c>] dump_stack+0x67/0x90
      [ 9238.594430]  [<ffffffff81052b14>] __warn+0xc2/0xdd
      [ 9238.594432]  [<ffffffff81052b7a>] warn_slowpath_fmt+0x4b/0x53
      [ 9238.594434]  [<ffffffff8116c311>] ? kmem_cache_free+0x128/0x188
      [ 9238.594450]  [<ffffffffa04d43f5>] btrfs_remove_chunk+0x2e5/0x793 [btrfs]
      [ 9238.594452]  [<ffffffff8108e456>] ? arch_local_irq_save+0x9/0xc
      [ 9238.594464]  [<ffffffffa04a26fa>] btrfs_delete_unused_bgs+0x317/0x382 [btrfs]
      [ 9238.594476]  [<ffffffffa04a961d>] cleaner_kthread+0x1ad/0x1c7 [btrfs]
      [ 9238.594489]  [<ffffffffa04a9470>] ? btree_invalidatepage+0x8e/0x8e [btrfs]
      [ 9238.594490]  [<ffffffff8106f403>] kthread+0xd4/0xdc
      [ 9238.594494]  [<ffffffff8149e242>] ret_from_fork+0x22/0x40
      [ 9238.594495]  [<ffffffff8106f32f>] ? kthread_stop+0x286/0x286
      [ 9238.594496] ---[ end trace 183efbe50275f059 ]---
      
      The sequence of steps leading to this is like the following:
      
                    CPU 1                                           CPU 2
      
       btrfs_dev_replace_finishing()
      
         at this point
         dev_replace->tgtdev->devid ==
         BTRFS_DEV_REPLACE_DEVID (0ULL)
      
         ...
      
         btrfs_start_transaction()
         btrfs_commit_transaction()
      
                                                           btrfs_delete_unused_bgs()
                                                             btrfs_remove_chunk()
      
                                                               looks up for the extent map
                                                               corresponding to the chunk
      
                                                               lock_chunks() (chunk_mutex)
                                                               check_system_chunk()
                                                               unlock_chunks() (chunk_mutex)
      
         locks fs_info->chunk_mutex
      
         btrfs_dev_replace_update_device_in_mapping_tree()
           --> iterates fs_info->mapping_tree and
               replaces the device in every extent
               map's map->stripes[] with
               dev_replace->tgtdev, which still has
               an id of 0ULL (BTRFS_DEV_REPLACE_DEVID)
      
                                                               iterates over all stripes from
                                                               the extent map
      
                                                                 --> calls btrfs_free_dev_extent()
                                                                     passing it the target device
                                                                     that still has an ID of 0ULL
      
                                                                 --> btrfs_free_dev_extent() fails
                                                                   --> aborts current transaction
      
         finishes setting up the target device,
         namely it sets tgtdev->devid to the value
         of srcdev->devid (which is necessarily > 0)
      
         frees the srcdev
      
         unlocks fs_info->chunk_mutex
      
      So fix this by taking the device list mutex while processing the stripes
      for the chunk's extent map. This is similar to the race between device
      replace and block group creation that was fixed by commit 50460e37
      ("Btrfs: fix race when finishing dev replace leading to transaction abort").
      Signed-off-by: Filipe Manana <fdmanana@suse.com>
      Reviewed-by: Josef Bacik <jbacik@fb.com>
      57ba4cb8
  17. 26 5月, 2016 2 次提交
  18. 21 5月, 2016 1 次提交
  19. 06 5月, 2016 1 次提交
    • A
      btrfs: fix lock dep warning move scratch super outside of chunk_mutex · 48b3b9d4
      Anand Jain 提交于
      Move scratch super outside of the chunk lock to avoid below
      lockdep warning. The better place to scratch super is in
      the function btrfs_rm_dev_replace_free_srcdev() just before
      free_device, which is outside of the chunk lock as well.
      
      To reproduce:
        (fresh boot)
        mkfs.btrfs -f -draid5 -mraid5 /dev/sdc /dev/sdd /dev/sde
        mount /dev/sdc /btrfs
        dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100
        (get devmgt from https://github.com/asj/devmgt.git)
        devmgt detach /dev/sde
        dd if=/dev/zero of=/btrfs/tf1 bs=4096 count=100
        sync
        btrfs replace start -Brf 3 /dev/sdf /btrfs <--
        devmgt attach host7
      
      ======================================================
      [ INFO: possible circular locking dependency detected ]
      4.6.0-rc2asj+ #1 Not tainted
      ---------------------------------------------------
      
      btrfs/2174 is trying to acquire lock:
      (sb_writers){.+.+.+}, at:
      [<ffffffff812449b4>] __sb_start_write+0xb4/0xf0
      
      but task is already holding lock:
      (&fs_info->chunk_mutex){+.+.+.}, at:
      [<ffffffffa05c5f55>] btrfs_dev_replace_finishing+0x145/0x980 [btrfs]
      
      which lock already depends on the new lock.
      
      Chain exists of:
      sb_writers --> &fs_devs->device_list_mutex --> &fs_info->chunk_mutex
      Possible unsafe locking scenario:
      CPU0				CPU1
      ----				----
      lock(&fs_info->chunk_mutex);
      				lock(&fs_devs->device_list_mutex);
      				lock(&fs_info->chunk_mutex);
      lock(sb_writers);
      
      *** DEADLOCK ***
      
      -> #0 (sb_writers){.+.+.+}:
      [<ffffffff810e6415>] __lock_acquire+0x1bc5/0x1ee0
      [<ffffffff810e707e>] lock_acquire+0xbe/0x210
      [<ffffffff810df49a>] percpu_down_read+0x4a/0xa0
      [<ffffffff812449b4>] __sb_start_write+0xb4/0xf0
      [<ffffffff81265534>] mnt_want_write+0x24/0x50
      [<ffffffff812508a2>] path_openat+0x952/0x1190
      [<ffffffff81252451>] do_filp_open+0x91/0x100
      [<ffffffff8123f5cc>] file_open_name+0xfc/0x140
      [<ffffffff8123f643>] filp_open+0x33/0x60
      [<ffffffffa0572bb6>] update_dev_time+0x16/0x40 [btrfs]
      [<ffffffffa057f60d>] btrfs_scratch_superblocks+0x5d/0xb0 [btrfs]
      [<ffffffffa057f70e>] btrfs_rm_dev_replace_remove_srcdev+0xae/0xd0 [btrfs]
      [<ffffffffa05c62c5>] btrfs_dev_replace_finishing+0x4b5/0x980 [btrfs]
      [<ffffffffa05c6ae8>] btrfs_dev_replace_start+0x358/0x530 [btrfs]
      Signed-off-by: Anand Jain <anand.jain@oracle.com>
      Signed-off-by: David Sterba <dsterba@suse.com>
      48b3b9d4