drm/amdgpu: bypass some cleanup work after err_event_athub (v2)
author Le Ma <le.ma@amd.com>
Fri, 25 Oct 2019 09:48:52 +0000 (17:48 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Wed, 30 Oct 2019 15:06:52 +0000 (11:06 -0400)
The PSP loses its connection when err_event_athub occurs. This cleanup work
can be skipped in BACO reset.

v2: squash in missing include (Alex)

Signed-off-by: Le Ma <le.ma@amd.com>
Reviewed-by: Hawking Zhang <hawking.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

index c03089503b0fc2abec21ab27250ad3a7b637d29d..d36d2b09353944fe8fb22ebd9eff352cd1805d93 100644 (file)
@@ -2271,6 +2271,12 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
                /* displays are handled in phase1 */
                if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
                        continue;
+               /* PSP lost connection when err_event_athub occurs */
+               if (amdgpu_ras_intr_triggered() &&
+                   adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
+                       adev->ip_blocks[i].status.hw = false;
+                       continue;
+               }
                /* XXX handle errors */
                r = adev->ip_blocks[i].version->funcs->suspend(adev);
                /* XXX handle errors */
index fd7a73f4fa706201f0014f1e52b2315bc2b29cc0..bbe9ac7e843f94e5f215dc01d3383fc5557d9a34 100644 (file)
@@ -34,6 +34,8 @@
 #include "psp_v11_0.h"
 #include "psp_v12_0.h"
 
+#include "amdgpu_ras.h"
+
 static void psp_set_funcs(struct amdgpu_device *adev);
 
 static int psp_early_init(void *handle)
@@ -167,6 +169,13 @@ psp_cmd_submit_buf(struct psp_context *psp,
        while (*((unsigned int *)psp->fence_buf) != index) {
                if (--timeout == 0)
                        break;
+               /*
+                * Shouldn't wait for timeout when err_event_athub occurs,
+                * because gpu reset thread triggered and lock resource should
+                * be released for psp resume sequence.
+                */
+               if (amdgpu_ras_intr_triggered())
+                       break;
                msleep(1);
                amdgpu_asic_invalidate_hdp(psp->adev, NULL);
        }
index 796326b36e00d7bef550665efc5633613e214e1a..dab90c28047643e1cff7c56861563a39cc67d1bb 100644 (file)
@@ -558,15 +558,17 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
        if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
                return 0;
 
-       ret = psp_ras_enable_features(&adev->psp, &info, enable);
-       if (ret) {
-               DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
-                               enable ? "enable":"disable",
-                               ras_block_str(head->block),
-                               ret);
-               if (ret == TA_RAS_STATUS__RESET_NEEDED)
-                       return -EAGAIN;
-               return -EINVAL;
+       if (!amdgpu_ras_intr_triggered()) {
+               ret = psp_ras_enable_features(&adev->psp, &info, enable);
+               if (ret) {
+                       DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
+                                       enable ? "enable":"disable",
+                                       ras_block_str(head->block),
+                                       ret);
+                       if (ret == TA_RAS_STATUS__RESET_NEEDED)
+                               return -EAGAIN;
+                       return -EINVAL;
+               }
        }
 
        /* setup the obj */
index 088c6a734a1ac2caf00c1ec54aabad62370c43fc..d694be9a8c3929c6a55832bfafdb7dc6809b4b83 100644 (file)
@@ -3736,8 +3736,10 @@ static int gfx_v9_0_hw_fini(void *handle)
        amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
        amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 
-       /* disable KCQ to avoid CPC touch memory not valid anymore */
-       gfx_v9_0_kcq_disable(adev);
+       /* DF freeze and kcq disable will fail */
+       if (!amdgpu_ras_intr_triggered())
+               /* disable KCQ to avoid CPC touch memory not valid anymore */
+               gfx_v9_0_kcq_disable(adev);
 
        if (amdgpu_sriov_vf(adev)) {
                gfx_v9_0_cp_gfx_enable(adev, false);