drm/amdgpu: support gfx ras error injection and err_cnt query
author Dennis Li <Dennis.Li@amd.com>
Wed, 31 Jul 2019 12:45:50 +0000 (20:45 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 31 Jul 2019 19:51:14 +0000 (14:51 -0500)
Check the gfx error count in both the ras query function and
the ras interrupt handler.

gfx ras is still disabled by default due to a known stability
issue found in gpu reset.

Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

index 0c31bd06a7e8b23432175aea57a45a5f63f9dd7b..e15fedb0ce7340df68c220833c5f727cc19ecfa1 100644 (file)
@@ -602,6 +602,10 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
                if (adev->umc.funcs->query_ras_error_count)
                        adev->umc.funcs->query_ras_error_count(adev, &err_data);
                break;
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->query_ras_error_count)
+                       adev->gfx.funcs->query_ras_error_count(adev, &err_data);
+               break;
        default:
                break;
        }
@@ -639,13 +643,22 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
        if (!obj)
                return -EINVAL;
 
-       if (block_info.block_id != TA_RAS_BLOCK__UMC) {
+       switch (info->head.block) {
+       case AMDGPU_RAS_BLOCK__GFX:
+               if (adev->gfx.funcs->ras_error_inject)
+                       ret = adev->gfx.funcs->ras_error_inject(adev, info);
+               else
+                       ret = -EINVAL;
+               break;
+       case AMDGPU_RAS_BLOCK__UMC:
+               ret = psp_ras_trigger_error(&adev->psp, &block_info);
+               break;
+       default:
                DRM_INFO("%s error injection is not supported yet\n",
                         ras_block_str(info->head.block));
-               return -EINVAL;
+               ret = -EINVAL;
        }
 
-       ret = psp_ras_trigger_error(&adev->psp, &block_info);
        if (ret)
                DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
                                ras_block_str(info->head.block),
index c6ad662602bef51d2b7766efb54c39026c962502..bd82f6303bd6ac3a8b64cd2092867d4b98a981f0 100644 (file)
@@ -5611,6 +5611,8 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
 {
        /* TODO ue will trigger an interrupt. */
        kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+       if (adev->gfx.funcs->query_ras_error_count)
+               adev->gfx.funcs->query_ras_error_count(adev, err_data);
        amdgpu_ras_reset_gpu(adev, 0);
        return AMDGPU_RAS_UE;
 }