drm/amdgpu: retry init if it fails due to exclusive mode timeout (v3)
author pding <Pixel.Ding@amd.com>
Mon, 23 Oct 2017 09:22:09 +0000 (17:22 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 4 Dec 2017 21:33:14 +0000 (16:33 -0500)
In practice, exclusive mode has a real-time limit, e.g. it must be
completed within 300ms. A timeout is easily observed when running many
VFs/VMs on a single host under heavy CPU workload.

If we find the init fails due to exclusive mode timeout, try it again.

v2:
 - rewrite the condition for readability.

v3:
 - fix typo, add comments for sleep

Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: pding <Pixel.Ding@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

index 212138476130b4d5d805acaa7c6737f9d92adff7..e521850e9409027190d1c80e8faebc3719c770c6 100644 (file)
@@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
        r = amdgpu_init(adev);
        if (r) {
+               /* failed in exclusive mode due to timeout */
+               if (amdgpu_sriov_vf(adev) &&
+                   !amdgpu_sriov_runtime(adev) &&
+                   amdgpu_virt_mmio_blocked(adev) &&
+                   !amdgpu_virt_wait_reset(adev)) {
+                       dev_err(adev->dev, "VF exclusive mode timeout\n");
+                       r = -EAGAIN;
+                       goto failed;
+               }
                dev_err(adev->dev, "amdgpu_init failed\n");
                amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
                amdgpu_fini(adev);
@@ -2390,6 +2399,7 @@ failed:
        amdgpu_vf_error_trans_all(adev);
        if (runtime)
                vga_switcheroo_fini_domain_pm_ops(adev->dev);
+
        return r;
 }
 
index 720139e182a3d8be1cb2015630e3519f96785d8a..f313eee60c4a78cd179ebb67cfa4357424e3e838 100644 (file)
@@ -86,7 +86,7 @@ done_free:
 int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
 {
        struct amdgpu_device *adev;
-       int r, acpi_status;
+       int r, acpi_status, retry = 0;
 
 #ifdef CONFIG_DRM_AMDGPU_SI
        if (!amdgpu_si_support) {
@@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
                }
        }
 #endif
+retry_init:
 
        adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
        if (adev == NULL) {
@@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
         * VRAM allocation
         */
        r = amdgpu_device_init(adev, dev, dev->pdev, flags);
-       if (r) {
+       if (r == -EAGAIN && ++retry <= 3) {
+               adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+               adev->virt.ops = NULL;
+               amdgpu_device_fini(adev);
+               kfree(adev);
+               dev->dev_private = NULL;
+               /* Don't request EX mode too frequently which is attacking */
+               msleep(5000);
+               dev_err(&dev->pdev->dev, "retry init %d\n", retry);
+               goto retry_init;
+       } else if (r) {
                dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
                goto out;
        }