提交 8840a387 编写于 作者: P pding 提交者: Alex Deucher

drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)

In practice, exclusive mode has a real-time limitation, such as having to
be done within 300ms. This is easily observed when running many VFs/VMs on
a single host with heavy CPU workload.

If init fails due to an exclusive mode timeout, try it again.

v2:
 - rewrite the condition for readable value.

v3:
 - fix typo, add comments for sleep
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: pding <Pixel.Ding@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
上级 b5914238
...@@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev, ...@@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
r = amdgpu_init(adev); r = amdgpu_init(adev);
if (r) { if (r) {
/* failed in exclusive mode due to timeout */
if (amdgpu_sriov_vf(adev) &&
!amdgpu_sriov_runtime(adev) &&
amdgpu_virt_mmio_blocked(adev) &&
!amdgpu_virt_wait_reset(adev)) {
dev_err(adev->dev, "VF exclusive mode timeout\n");
r = -EAGAIN;
goto failed;
}
dev_err(adev->dev, "amdgpu_init failed\n"); dev_err(adev->dev, "amdgpu_init failed\n");
amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0); amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
amdgpu_fini(adev); amdgpu_fini(adev);
...@@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, ...@@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
amdgpu_vf_error_trans_all(adev); amdgpu_vf_error_trans_all(adev);
if (runtime) if (runtime)
vga_switcheroo_fini_domain_pm_ops(adev->dev); vga_switcheroo_fini_domain_pm_ops(adev->dev);
return r; return r;
} }
......
...@@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev) ...@@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
{ {
struct amdgpu_device *adev; struct amdgpu_device *adev;
int r, acpi_status; int r, acpi_status, retry = 0;
#ifdef CONFIG_DRM_AMDGPU_SI #ifdef CONFIG_DRM_AMDGPU_SI
if (!amdgpu_si_support) { if (!amdgpu_si_support) {
...@@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) ...@@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
} }
} }
#endif #endif
retry_init:
adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL); adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
if (adev == NULL) { if (adev == NULL) {
...@@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) ...@@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
* VRAM allocation * VRAM allocation
*/ */
r = amdgpu_device_init(adev, dev, dev->pdev, flags); r = amdgpu_device_init(adev, dev, dev->pdev, flags);
if (r) { if (r == -EAGAIN && ++retry <= 3) {
adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
adev->virt.ops = NULL;
amdgpu_device_fini(adev);
kfree(adev);
dev->dev_private = NULL;
/* Don't request EX mode too frequently which is attacking */
msleep(5000);
dev_err(&dev->pdev->dev, "retry init %d\n", retry);
goto retry_init;
} else if (r) {
dev_err(&dev->pdev->dev, "Fatal error during GPU init\n"); dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
goto out; goto out;
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册