drm/amd/amdgpu: Fix compute ring unable to detect hang.
author Jesse Zhang <zhexi.zhang@amd.com>
Tue, 30 Jul 2019 11:15:42 +0000 (19:15 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Thu, 3 Oct 2019 14:11:01 +0000 (09:11 -0500)
When a compute fence does not signal, the compute ring cannot detect a
hardware hang because its timeout value is infinite by default.

In SR-IOV and passthrough mode, if the user does not declare a custom timeout
value for the compute ring, then use the gfx ring timeout value as the default,
so that when there is a true hardware hang, the compute ring can detect it.

Signed-off-by: Jesse Zhang <zhexi.zhang@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

index 9f916a3314594635bcb68da06ffaddc35e88b4f6..1364a2be68e0c5ac32aa27072c24ca17c09079c4 100644 (file)
@@ -1025,12 +1025,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 
        amdgpu_device_check_block_size(adev);
 
-       ret = amdgpu_device_get_job_timeout_settings(adev);
-       if (ret) {
-               dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
-               return ret;
-       }
-
        adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
 
        return ret;
@@ -2745,6 +2739,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        if (r)
                return r;
 
+       r = amdgpu_device_get_job_timeout_settings(adev);
+       if (r) {
+               dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
+               return r;
+       }
+
        /* doorbell bar mapping and doorbell index init*/
        amdgpu_device_doorbell_init(adev);
 
index e2f166a32c92bc80c2d432654f81a7d144fb6289..864f7858d63073e62f856e44ee8381800f781ccb 100644 (file)
@@ -1341,10 +1341,15 @@ int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
        /*
         * By default timeout for non compute jobs is 10000.
         * And there is no timeout enforced on compute jobs.
+        * In SR-IOV or passthrough mode, timeout for compute
+        * jobs are 10000 by default.
         */
        adev->gfx_timeout = msecs_to_jiffies(10000);
        adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
-       adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
+       if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
+               adev->compute_timeout = adev->gfx_timeout;
+       else
+               adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
 
        if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) {
                while ((timeout_setting = strsep(&input, ",")) &&
index 23085b352cf2d91886d13660a21ba9e98a3b4c44..377fe20bce23ca540f5d4eba6996f8ec7674f2af 100644 (file)
@@ -462,18 +462,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
                        timeout = adev->gfx_timeout;
                        break;
                case AMDGPU_RING_TYPE_COMPUTE:
-                       /*
-                        * For non-sriov case, no timeout enforce
-                        * on compute ring by default. Unless user
-                        * specifies a timeout for compute ring.
-                        *
-                        * For sriov case, always use the timeout
-                        * as gfx ring
-                        */
-                       if (!amdgpu_sriov_vf(ring->adev))
-                               timeout = adev->compute_timeout;
-                       else
-                               timeout = adev->gfx_timeout;
+                       timeout = adev->compute_timeout;
                        break;
                case AMDGPU_RING_TYPE_SDMA:
                        timeout = adev->sdma_timeout;