• O
    drm/amdkfd: Fix a circular lock dependency · 6a6ef5ee
    Oak Zeng 提交于
    The idea to break the circular lock dependency is to temporarily drop
    dqm lock before calling allocate_mqd. See callstack #1 below.
    
    [   59.510149] [drm] Initialized amdgpu 3.30.0 20150101 for 0000:04:00.0 on minor 0
    
    [  513.604034] ======================================================
    [  513.604205] WARNING: possible circular locking dependency detected
    [  513.604375] 4.18.0-kfd-root #2 Tainted: G        W
    [  513.604530] ------------------------------------------------------
    [  513.604699] kswapd0/611 is trying to acquire lock:
    [  513.604840] 00000000d254022e (&dqm->lock_hidden){+.+.}, at: evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
    [  513.605150]
                   but task is already holding lock:
    [  513.605307] 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
    [  513.605540]
                   which lock already depends on the new lock.
    
    [  513.605747]
                   the existing dependency chain (in reverse order) is:
    [  513.605944]
                   -> #4 (&anon_vma->rwsem){++++}:
    [  513.606106]        __vma_adjust+0x147/0x7f0
    [  513.606231]        __split_vma+0x179/0x190
    [  513.606353]        mprotect_fixup+0x217/0x260
    [  513.606553]        do_mprotect_pkey+0x211/0x380
    [  513.606752]        __x64_sys_mprotect+0x1b/0x20
    [  513.606954]        do_syscall_64+0x50/0x1a0
    [  513.607149]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
    [  513.607380]
                   -> #3 (&mapping->i_mmap_rwsem){++++}:
    [  513.607678]        rmap_walk_file+0x1f0/0x280
    [  513.607887]        page_referenced+0xdd/0x180
    [  513.608081]        shrink_page_list+0x853/0xcb0
    [  513.608279]        shrink_inactive_list+0x33b/0x700
    [  513.608483]        shrink_node_memcg+0x37a/0x7f0
    [  513.608682]        shrink_node+0xd8/0x490
    [  513.608869]        balance_pgdat+0x18b/0x3b0
    [  513.609062]        kswapd+0x203/0x5c0
    [  513.609241]        kthread+0x100/0x140
    [  513.609420]        ret_from_fork+0x24/0x30
    [  513.609607]
                   -> #2 (fs_reclaim){+.+.}:
    [  513.609883]        kmem_cache_alloc_trace+0x34/0x2e0
    [  513.610093]        reservation_object_reserve_shared+0x139/0x300
    [  513.610326]        ttm_bo_init_reserved+0x291/0x480 [ttm]
    [  513.610567]        amdgpu_bo_do_create+0x1d2/0x650 [amdgpu]
    [  513.610811]        amdgpu_bo_create+0x40/0x1f0 [amdgpu]
    [  513.611041]        amdgpu_bo_create_reserved+0x249/0x2d0 [amdgpu]
    [  513.611290]        amdgpu_bo_create_kernel+0x12/0x70 [amdgpu]
    [  513.611584]        amdgpu_ttm_init+0x2cb/0x560 [amdgpu]
    [  513.611823]        gmc_v9_0_sw_init+0x400/0x750 [amdgpu]
    [  513.612491]        amdgpu_device_init+0x14eb/0x1990 [amdgpu]
    [  513.612730]        amdgpu_driver_load_kms+0x78/0x290 [amdgpu]
    [  513.612958]        drm_dev_register+0x111/0x1a0
    [  513.613171]        amdgpu_pci_probe+0x11c/0x1e0 [amdgpu]
    [  513.613389]        local_pci_probe+0x3f/0x90
    [  513.613581]        pci_device_probe+0x102/0x1c0
    [  513.613779]        driver_probe_device+0x2a7/0x480
    [  513.613984]        __driver_attach+0x10a/0x110
    [  513.614179]        bus_for_each_dev+0x67/0xc0
    [  513.614372]        bus_add_driver+0x1eb/0x260
    [  513.614565]        driver_register+0x5b/0xe0
    [  513.614756]        do_one_initcall+0xac/0x357
    [  513.614952]        do_init_module+0x5b/0x213
    [  513.615145]        load_module+0x2542/0x2d30
    [  513.615337]        __do_sys_finit_module+0xd2/0x100
    [  513.615541]        do_syscall_64+0x50/0x1a0
    [  513.615731]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
    [  513.615963]
                   -> #1 (reservation_ww_class_mutex){+.+.}:
    [  513.616293]        amdgpu_amdkfd_alloc_gtt_mem+0xcf/0x2c0 [amdgpu]
    [  513.616554]        init_mqd+0x223/0x260 [amdgpu]
    [  513.616779]        create_queue_nocpsch+0x4d9/0x600 [amdgpu]
    [  513.617031]        pqm_create_queue+0x37c/0x520 [amdgpu]
    [  513.617270]        kfd_ioctl_create_queue+0x2f9/0x650 [amdgpu]
    [  513.617522]        kfd_ioctl+0x202/0x350 [amdgpu]
    [  513.617724]        do_vfs_ioctl+0x9f/0x6c0
    [  513.617914]        ksys_ioctl+0x66/0x70
    [  513.618095]        __x64_sys_ioctl+0x16/0x20
    [  513.618286]        do_syscall_64+0x50/0x1a0
    [  513.618476]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
    [  513.618695]
                   -> #0 (&dqm->lock_hidden){+.+.}:
    [  513.618984]        __mutex_lock+0x98/0x970
    [  513.619197]        evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
    [  513.619459]        kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
    [  513.619710]        kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
    [  513.620103]        amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
    [  513.620363]        amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
    [  513.620614]        __mmu_notifier_invalidate_range_start+0x70/0xb0
    [  513.620851]        try_to_unmap_one+0x7fc/0x8f0
    [  513.621049]        rmap_walk_anon+0x121/0x290
    [  513.621242]        try_to_unmap+0x93/0xf0
    [  513.621428]        shrink_page_list+0x606/0xcb0
    [  513.621625]        shrink_inactive_list+0x33b/0x700
    [  513.621835]        shrink_node_memcg+0x37a/0x7f0
    [  513.622034]        shrink_node+0xd8/0x490
    [  513.622219]        balance_pgdat+0x18b/0x3b0
    [  513.622410]        kswapd+0x203/0x5c0
    [  513.622589]        kthread+0x100/0x140
    [  513.622769]        ret_from_fork+0x24/0x30
    [  513.622957]
                   other info that might help us debug this:
    
    [  513.623354] Chain exists of:
                     &dqm->lock_hidden --> &mapping->i_mmap_rwsem --> &anon_vma->rwsem
    
    [  513.623900]  Possible unsafe locking scenario:
    
    [  513.624189]        CPU0                    CPU1
    [  513.624397]        ----                    ----
    [  513.624594]   lock(&anon_vma->rwsem);
    [  513.624771]                                lock(&mapping->i_mmap_rwsem);
    [  513.625020]                                lock(&anon_vma->rwsem);
    [  513.625253]   lock(&dqm->lock_hidden);
    [  513.625433]
                    *** DEADLOCK ***
    
    [  513.625783] 3 locks held by kswapd0/611:
    [  513.625967]  #0: 00000000f14edf84 (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30
    [  513.626309]  #1: 00000000961547fc (&anon_vma->rwsem){++++}, at: page_lock_anon_vma_read+0xe4/0x250
    [  513.626671]  #2: 0000000067b5cd12 (srcu){....}, at: __mmu_notifier_invalidate_range_start+0x5/0xb0
    [  513.627037]
                   stack backtrace:
    [  513.627292] CPU: 0 PID: 611 Comm: kswapd0 Tainted: G        W         4.18.0-kfd-root #2
    [  513.627632] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
    [  513.627990] Call Trace:
    [  513.628143]  dump_stack+0x7c/0xbb
    [  513.628315]  print_circular_bug.isra.37+0x21b/0x228
    [  513.628581]  __lock_acquire+0xf7d/0x1470
    [  513.628782]  ? unwind_next_frame+0x6c/0x4f0
    [  513.628974]  ? lock_acquire+0xec/0x1e0
    [  513.629154]  lock_acquire+0xec/0x1e0
    [  513.629357]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
    [  513.629587]  __mutex_lock+0x98/0x970
    [  513.629790]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
    [  513.630047]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
    [  513.630309]  ? evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
    [  513.630562]  evict_process_queues_nocpsch+0x26/0x140 [amdgpu]
    [  513.630816]  kfd_process_evict_queues+0x3b/0xb0 [amdgpu]
    [  513.631057]  kgd2kfd_quiesce_mm+0x1c/0x40 [amdgpu]
    [  513.631288]  amdgpu_amdkfd_evict_userptr+0x38/0x70 [amdgpu]
    [  513.631536]  amdgpu_mn_invalidate_range_start_hsa+0xa6/0xc0 [amdgpu]
    [  513.632076]  __mmu_notifier_invalidate_range_start+0x70/0xb0
    [  513.632299]  try_to_unmap_one+0x7fc/0x8f0
    [  513.632487]  ? page_lock_anon_vma_read+0x68/0x250
    [  513.632690]  rmap_walk_anon+0x121/0x290
    [  513.632875]  try_to_unmap+0x93/0xf0
    [  513.633050]  ? page_remove_rmap+0x330/0x330
    [  513.633239]  ? rcu_read_unlock+0x60/0x60
    [  513.633422]  ? page_get_anon_vma+0x160/0x160
    [  513.633613]  shrink_page_list+0x606/0xcb0
    [  513.633800]  shrink_inactive_list+0x33b/0x700
    [  513.633997]  shrink_node_memcg+0x37a/0x7f0
    [  513.634186]  ? shrink_node+0xd8/0x490
    [  513.634363]  shrink_node+0xd8/0x490
    [  513.634537]  balance_pgdat+0x18b/0x3b0
    [  513.634718]  kswapd+0x203/0x5c0
    [  513.634887]  ? wait_woken+0xb0/0xb0
    [  513.635062]  kthread+0x100/0x140
    [  513.635231]  ? balance_pgdat+0x3b0/0x3b0
    [  513.635414]  ? kthread_delayed_work_timer_fn+0x80/0x80
    [  513.635626]  ret_from_fork+0x24/0x30
    [  513.636042] Evicting PASID 32768 queues
    [  513.936236] Restoring PASID 32768 queues
    [  524.708912] Evicting PASID 32768 queues
    [  524.999875] Restoring PASID 32768 queues
    Signed-off-by: NOak Zeng <Oak.Zeng@amd.com>
    Reviewed-by: NPhilip Yang <philip.yang@amd.com>
    Signed-off-by: NAlex Deucher <alexander.deucher@amd.com>
    6a6ef5ee
kfd_device_queue_manager.c 49.7 KB