diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index a0ea663ecdbc9f228fd220480d802260651ffbe8..92790db5edc026838c85e5fd5722fe30fae547d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -789,4 +789,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) { } + +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) +{ +} #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index ffe149aafc39330f07c57cff47ace2aa5dd22e35..a10507ecb75082f531f5e6c53808067ca4944afa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -270,5 +270,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm); int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, struct dma_fence *fence); void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask); #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 4bfedaab183f0ee6a316517a7db30a68d69ac913..d5e790f046b4bc297a2256f1f2861ae847b3a6f5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -29,6 +29,7 @@ #include "cwsr_trap_handler.h" #include "kfd_iommu.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" #define MQD_SIZE_ALIGNED 768 @@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd) WARN_ONCE(count < 0, "Compute profile ref. count error"); } +void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask) +{ + if (kfd) + kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask); +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c index 7b348bf9df2140ed43a69a1d45aee2d1f2176123..86c2c3e97944b6581e7f14f2fcd9e3448f10a4fc 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -24,6 +24,7 @@ #include #include #include +#include "amdgpu.h" #include "amdgpu_vm.h" #include "kfd_priv.h" #include "kfd_smi_events.h" @@ -148,6 +149,54 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) return 0; } +static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event, + char *event_msg, int len) +{ + struct kfd_smi_client *client; + + rcu_read_lock(); + + list_for_each_entry_rcu(client, &dev->smi_clients, list) { + if (!(READ_ONCE(client->events) & smi_event)) + continue; + spin_lock(&client->lock); + if (kfifo_avail(&client->fifo) >= len) { + kfifo_in(&client->fifo, event_msg, len); + wake_up_all(&client->wait_queue); + } else { + pr_debug("smi_event(EventID: %llu): no space left\n", + smi_event); + } + spin_unlock(&client->lock); + } + + rcu_read_unlock(); +} + +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, + uint32_t throttle_bitmask) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; + /* + * ThermalThrottle msg = throttle_bitmask(8): + * thermal_interrupt_count(16): + * 16 bytes event + 1 byte space + 8 byte throttle_bitmask + + * 1 byte : + 16 byte thermal_interupt_counter + 1 byte \n + + * 1 byte \0 = 44 + */ + char fifo_in[44]; + int len; + + if (list_empty(&dev->smi_clients)) + return; + + len = snprintf(fifo_in, 44, "%x %x:%llx\n", + KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask, + atomic64_read(&adev->smu.throttle_int_counter)); + + add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len); +} + void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) { struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; @@ -156,7 +205,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43 */ char fifo_in[43]; - struct kfd_smi_client *client; int len; if (list_empty(&dev->smi_clients)) @@ -171,22 +219,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, task_info.pid, task_info.task_name); - rcu_read_lock(); - - list_for_each_entry_rcu(client, &dev->smi_clients, list) { - if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT)) - continue; - spin_lock(&client->lock); - if (kfifo_avail(&client->fifo) >= len) { - kfifo_in(&client->fifo, fifo_in, len); - wake_up_all(&client->wait_queue); - } - else - pr_debug("smi_event(vmfault): no space left\n"); - spin_unlock(&client->lock); - } - - rcu_read_unlock(); + add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len); } int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h index a9cb218fef96eaad15721aa35d9e85b60f8f1d46..15537b2cccb5868852199ed7bf2e85975dd5870e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -25,5 +25,7 @@ int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); +void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, + uint32_t throttle_bitmask); #endif diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 0eeccf3341a32df5ece205a1c8aec154be32789d..7d9c40ad57801fbe8a50236f6b37b9919d25efe8 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -640,6 +640,7 @@ static int smu_sw_init(void *handle) mutex_init(&smu->message_lock); INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn); + atomic64_set(&smu->throttle_int_counter, 0); smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; diff --git a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c index 3b9182c8c53f8f2e779695417a209786c386a7f6..f13979687b9e61a6343bbb4a824ad5ca5d1d9d33 100644 --- a/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/arcturus_ppt.c @@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu) dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n", log_buf); + kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status); } static const struct pptable_funcs arcturus_ppt_funcs = { diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index 28312d6dc187c364d904523712b1ee8dc733daf2..b57b1040639078f093ab887888b0c36974c697fc 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -446,6 +446,7 @@ struct smu_context bool dc_controlled_by_gpio; struct work_struct throttling_logging_work; + atomic64_t throttle_int_counter; }; struct i2c_adapter; diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index fd82402065e638504830f8816fb906f2115909d4..a9453ec01619d6c4389af7530b57f173f297f1dc 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, smu_v11_0_ack_ac_dc_interrupt(&adev->smu); break; case 0x7: + /* + * Increment the throttle interrupt counter + */ + atomic64_inc(&smu->throttle_int_counter); + if (!atomic_read(&adev->throttling_logging_enabled)) return 0; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index f738c3b53f4eaba4a58f8427912f7f3221b321eb..df6c7a43aadcffeb2bfc808a63779aedcb704484 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -450,7 +450,8 @@ struct kfd_ioctl_import_dmabuf_args { * KFD SMI(System Management Interface) events */ /* Event type (defined by bitmask) */ -#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 +#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 +#define KFD_SMI_EVENT_THERMAL_THROTTLE 0x0000000000000002 struct kfd_ioctl_smi_events_args { __u32 gpuid; /* to KFD */