提交 c7f21978 编写于 作者: P Philip Yang 提交者: Alex Deucher

drm/amdkfd: Add user queue eviction restore SMI event

Output user queue eviction and restore events. User queue eviction may be
triggered by an SVM or userptr MMU notifier, TTM eviction, device suspend,
or CRIU checkpoint and restore.

User queue restore may be rescheduled if eviction happens again while the
restore is in progress.
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
上级 acac270d
...@@ -336,7 +336,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo) ...@@ -336,7 +336,7 @@ void amdgpu_amdkfd_release_notify(struct amdgpu_bo *bo)
} }
#endif #endif
/* KGD2KFD callbacks */ /* KGD2KFD callbacks */
int kgd2kfd_quiesce_mm(struct mm_struct *mm); int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger);
int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence); struct dma_fence *fence);
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include "amdgpu_dma_buf.h" #include "amdgpu_dma_buf.h"
#include <uapi/linux/kfd_ioctl.h> #include <uapi/linux/kfd_ioctl.h>
#include "amdgpu_xgmi.h" #include "amdgpu_xgmi.h"
#include "kfd_smi_events.h"
/* Userptr restore delay, just long enough to allow consecutive VM /* Userptr restore delay, just long enough to allow consecutive VM
* changes to accumulate * changes to accumulate
...@@ -2346,7 +2347,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem, ...@@ -2346,7 +2347,7 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
evicted_bos = atomic_inc_return(&process_info->evicted_bos); evicted_bos = atomic_inc_return(&process_info->evicted_bos);
if (evicted_bos == 1) { if (evicted_bos == 1) {
/* First eviction, stop the queues */ /* First eviction, stop the queues */
r = kgd2kfd_quiesce_mm(mm); r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
if (r) if (r)
pr_err("Failed to quiesce KFD\n"); pr_err("Failed to quiesce KFD\n");
schedule_delayed_work(&process_info->restore_userptr_work, schedule_delayed_work(&process_info->restore_userptr_work,
...@@ -2620,13 +2621,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work) ...@@ -2620,13 +2621,16 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
unlock_out: unlock_out:
mutex_unlock(&process_info->lock); mutex_unlock(&process_info->lock);
mmput(mm);
put_task_struct(usertask);
/* If validation failed, reschedule another attempt */ /* If validation failed, reschedule another attempt */
if (evicted_bos) if (evicted_bos) {
schedule_delayed_work(&process_info->restore_userptr_work, schedule_delayed_work(&process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS)); msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
}
mmput(mm);
put_task_struct(usertask);
} }
/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given /** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
......
...@@ -2434,7 +2434,7 @@ static int criu_restore(struct file *filep, ...@@ -2434,7 +2434,7 @@ static int criu_restore(struct file *filep,
* Set the process to evicted state to avoid running any new queues before all the memory * Set the process to evicted state to avoid running any new queues before all the memory
* mappings are ready. * mappings are ready.
*/ */
ret = kfd_process_evict_queues(p); ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
if (ret) if (ret)
goto exit_unlock; goto exit_unlock;
...@@ -2553,7 +2553,7 @@ static int criu_process_info(struct file *filep, ...@@ -2553,7 +2553,7 @@ static int criu_process_info(struct file *filep,
goto err_unlock; goto err_unlock;
} }
ret = kfd_process_evict_queues(p); ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
if (ret) if (ret)
goto err_unlock; goto err_unlock;
......
...@@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) ...@@ -837,7 +837,7 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
spin_unlock_irqrestore(&kfd->interrupt_lock, flags); spin_unlock_irqrestore(&kfd->interrupt_lock, flags);
} }
int kgd2kfd_quiesce_mm(struct mm_struct *mm) int kgd2kfd_quiesce_mm(struct mm_struct *mm, uint32_t trigger)
{ {
struct kfd_process *p; struct kfd_process *p;
int r; int r;
...@@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm) ...@@ -851,7 +851,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
return -ESRCH; return -ESRCH;
WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
r = kfd_process_evict_queues(p); r = kfd_process_evict_queues(p, trigger);
kfd_unref_process(p); kfd_unref_process(p);
return r; return r;
......
...@@ -947,7 +947,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx( ...@@ -947,7 +947,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
} }
void kfd_unref_process(struct kfd_process *p); void kfd_unref_process(struct kfd_process *p);
int kfd_process_evict_queues(struct kfd_process *p); int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger);
int kfd_process_restore_queues(struct kfd_process *p); int kfd_process_restore_queues(struct kfd_process *p);
void kfd_suspend_all_processes(void); void kfd_suspend_all_processes(void);
int kfd_resume_all_processes(void); int kfd_resume_all_processes(void);
......
...@@ -43,6 +43,7 @@ struct mm_struct; ...@@ -43,6 +43,7 @@ struct mm_struct;
#include "kfd_device_queue_manager.h" #include "kfd_device_queue_manager.h"
#include "kfd_iommu.h" #include "kfd_iommu.h"
#include "kfd_svm.h" #include "kfd_svm.h"
#include "kfd_smi_events.h"
/* /*
* List of struct kfd_process (field kfd_process). * List of struct kfd_process (field kfd_process).
...@@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm) ...@@ -1736,7 +1737,7 @@ struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct *mm)
* Eviction is reference-counted per process-device. This means multiple * Eviction is reference-counted per process-device. This means multiple
* evictions from different sources can be nested safely. * evictions from different sources can be nested safely.
*/ */
int kfd_process_evict_queues(struct kfd_process *p) int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
{ {
int r = 0; int r = 0;
int i; int i;
...@@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p) ...@@ -1745,6 +1746,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
for (i = 0; i < p->n_pdds; i++) { for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i]; struct kfd_process_device *pdd = p->pdds[i];
kfd_smi_event_queue_eviction(pdd->dev, p->lead_thread->pid,
trigger);
r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm, r = pdd->dev->dqm->ops.evict_process_queues(pdd->dev->dqm,
&pdd->qpd); &pdd->qpd);
/* evict return -EIO if HWS is hang or asic is resetting, in this case /* evict return -EIO if HWS is hang or asic is resetting, in this case
...@@ -1769,6 +1773,9 @@ int kfd_process_evict_queues(struct kfd_process *p) ...@@ -1769,6 +1773,9 @@ int kfd_process_evict_queues(struct kfd_process *p)
if (n_evicted == 0) if (n_evicted == 0)
break; break;
kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, if (pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
&pdd->qpd)) &pdd->qpd))
pr_err("Failed to restore queues\n"); pr_err("Failed to restore queues\n");
...@@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p) ...@@ -1788,6 +1795,8 @@ int kfd_process_restore_queues(struct kfd_process *p)
for (i = 0; i < p->n_pdds; i++) { for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i]; struct kfd_process_device *pdd = p->pdds[i];
kfd_smi_event_queue_restore(pdd->dev, p->lead_thread->pid);
r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm, r = pdd->dev->dqm->ops.restore_process_queues(pdd->dev->dqm,
&pdd->qpd); &pdd->qpd);
if (r) { if (r) {
...@@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work) ...@@ -1849,7 +1858,7 @@ static void evict_process_worker(struct work_struct *work)
flush_delayed_work(&p->restore_work); flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid 0x%x\n", p->pasid); pr_debug("Started evicting pasid 0x%x\n", p->pasid);
ret = kfd_process_evict_queues(p); ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) { if (!ret) {
dma_fence_signal(p->ef); dma_fence_signal(p->ef);
dma_fence_put(p->ef); dma_fence_put(p->ef);
...@@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void) ...@@ -1916,7 +1925,7 @@ void kfd_suspend_all_processes(void)
cancel_delayed_work_sync(&p->eviction_work); cancel_delayed_work_sync(&p->eviction_work);
cancel_delayed_work_sync(&p->restore_work); cancel_delayed_work_sync(&p->restore_work);
if (kfd_process_evict_queues(p)) if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid); pr_err("Failed to suspend process 0x%x\n", p->pasid);
dma_fence_signal(p->ef); dma_fence_signal(p->ef);
dma_fence_put(p->ef); dma_fence_put(p->ef);
......
...@@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid, ...@@ -283,6 +283,41 @@ void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
from, to, trigger); from, to, trigger);
} }
/*
 * kfd_smi_event_queue_eviction - report a user queue eviction over SMI
 * @dev:     device the event is added for; its id is printed in hex
 * @pid:     process id, passed to kfd_smi_event_add() and printed in the
 *           event text with a leading '-'
 * @trigger: eviction trigger code supplied by the caller, printed last
 *
 * Adds one KFD_SMI_EVENT_QUEUE_EVICTION event whose text is
 * "<boottime ns> -<pid> <dev id> <trigger>\n".
 */
void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
				  uint32_t trigger)
{
	/* Sample the boot-time clock once; it is the first field of the event. */
	const long long now_ns = ktime_get_boottime_ns();

	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_EVICTION,
			  "%lld -%d %x %d\n",
			  now_ns, pid, dev->id, trigger);
}
/*
 * kfd_smi_event_queue_restore - report a user queue restore over SMI
 * @dev: device the event is added for; its id is printed in hex
 * @pid: process id, passed to kfd_smi_event_add() and printed in the
 *       event text with a leading '-'
 *
 * Adds one KFD_SMI_EVENT_QUEUE_RESTORE event whose text is
 * "<boottime ns> -<pid> <dev id>\n".
 */
void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid)
{
	/* Sample the boot-time clock once; it is the first field of the event. */
	const long long now_ns = ktime_get_boottime_ns();

	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_QUEUE_RESTORE,
			  "%lld -%d %x\n",
			  now_ns, pid, dev->id);
}
void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm)
{
struct kfd_process *p;
int i;
p = kfd_lookup_process_by_mm(mm);
if (!p)
return;
for (i = 0; i < p->n_pdds; i++) {
struct kfd_process_device *pdd = p->pdds[i];
kfd_smi_event_add(p->lead_thread->pid, pdd->dev,
KFD_SMI_EVENT_QUEUE_RESTORE,
"%lld -%d %x %c\n", ktime_get_boottime_ns(),
p->lead_thread->pid, pdd->dev->id, 'R');
}
kfd_unref_process(p);
}
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
{ {
struct kfd_smi_client *client; struct kfd_smi_client *client;
......
...@@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid, ...@@ -42,4 +42,8 @@ void kfd_smi_event_migration_start(struct kfd_dev *dev, pid_t pid,
void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid, void kfd_smi_event_migration_end(struct kfd_dev *dev, pid_t pid,
unsigned long start, unsigned long end, unsigned long start, unsigned long end,
uint32_t from, uint32_t to, uint32_t trigger); uint32_t from, uint32_t to, uint32_t trigger);
void kfd_smi_event_queue_eviction(struct kfd_dev *dev, pid_t pid,
uint32_t trigger);
void kfd_smi_event_queue_restore(struct kfd_dev *dev, pid_t pid);
void kfd_smi_event_queue_restore_rescheduled(struct mm_struct *mm);
#endif #endif
...@@ -1730,14 +1730,16 @@ static void svm_range_restore_work(struct work_struct *work) ...@@ -1730,14 +1730,16 @@ static void svm_range_restore_work(struct work_struct *work)
mutex_unlock(&svms->lock); mutex_unlock(&svms->lock);
mmap_write_unlock(mm); mmap_write_unlock(mm);
mutex_unlock(&process_info->lock); mutex_unlock(&process_info->lock);
mmput(mm);
/* If validation failed, reschedule another attempt */ /* If validation failed, reschedule another attempt */
if (evicted_ranges) { if (evicted_ranges) {
pr_debug("reschedule to restore svm range\n"); pr_debug("reschedule to restore svm range\n");
schedule_delayed_work(&svms->restore_work, schedule_delayed_work(&svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
} }
mmput(mm);
} }
/** /**
...@@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm, ...@@ -1793,7 +1795,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
prange->svms, prange->start, prange->last); prange->svms, prange->start, prange->last);
/* First eviction, stop the queues */ /* First eviction, stop the queues */
r = kgd2kfd_quiesce_mm(mm); r = kgd2kfd_quiesce_mm(mm, KFD_QUEUE_EVICTION_TRIGGER_SVM);
if (r) if (r)
pr_debug("failed to quiesce KFD\n"); pr_debug("failed to quiesce KFD\n");
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册