drm/vc4: Add fragment shader threading support
author Jonas Pfeil <pfeiljonas@gmx.de>
Mon, 7 Nov 2016 23:18:39 +0000 (00:18 +0100)
committer Eric Anholt <eric@anholt.net>
Wed, 16 Nov 2016 21:25:26 +0000 (13:25 -0800)
FS threading brings performance improvements of 0-20% in glmark2.

The validation code checks for thread switch signals and ensures that
the registers of the other thread are not touched, and that our clamps
are not live across thread switches.  It also checks that the
threading and branching instructions do not interfere.

(Original patch by Jonas, changes by anholt for style cleanup,
removing validation the kernel doesn't need to do, and adding the flag
for userspace).

v2: Minor style fixes from checkpatch.

Signed-off-by: Jonas Pfeil <pfeiljonas@gmx.de>
Signed-off-by: Eric Anholt <eric@anholt.net>
drivers/gpu/drm/vc4/vc4_drv.c
drivers/gpu/drm/vc4/vc4_drv.h
drivers/gpu/drm/vc4/vc4_validate.c
drivers/gpu/drm/vc4/vc4_validate_shaders.c
include/uapi/drm/vc4_drm.h

index 7abfe088f2d109706103a37ba342b6a16227dfb3..86aabf6d0f79139dffcc79f49cb9a23a1ed8cc9d 100644 (file)
@@ -82,6 +82,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
                break;
        case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
        case DRM_VC4_PARAM_SUPPORTS_ETC1:
+       case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
                args->value = true;
                break;
        default:
index 7c1e4d97486fb57d9ce7002c30a23138519761ae..fef17280434526ad6b687f5c98b7e443cb4da695 100644 (file)
@@ -381,6 +381,8 @@ struct vc4_validated_shader_info {
 
        uint32_t num_uniform_addr_offsets;
        uint32_t *uniform_addr_offsets;
+
+       bool is_threaded;
 };
 
 /**
index e18f88203d32f828b7256a05c653586c14095ef3..9fd171c361c23b52a4d507919ec7e26fd1e87aac 100644 (file)
@@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device *dev,
        exec->shader_rec_v += roundup(packet_size, 16);
        exec->shader_rec_size -= packet_size;
 
-       if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
-               DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
-               return -EINVAL;
-       }
-
        for (i = 0; i < shader_reloc_count; i++) {
                if (src_handles[i] > exec->bo_count) {
                        DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
@@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device *dev,
                        return -EINVAL;
        }
 
+       if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
+           to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
+               DRM_ERROR("Thread mode of CL and FS do not match\n");
+               return -EINVAL;
+       }
+
+       if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
+           to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
+               DRM_ERROR("cs and vs cannot be threaded\n");
+               return -EINVAL;
+       }
+
        for (i = 0; i < shader_reloc_count; i++) {
                struct vc4_validated_shader_info *validated_shader;
                uint32_t o = shader_reloc_offsets[i];
index 917321ce832ffda9d3e8ca20d987437eea9a1765..5dba13dd1e9b600b43a769d086d6eb428547ab66 100644 (file)
@@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
         * basic blocks.
         */
        bool needs_uniform_address_for_loop;
+
+       /* Set when we find an instruction writing the top half of the
+        * register files.  If we allowed writing the unusable regs in
+        * a threaded shader, then the other shader running on our
+        * QPU's clamp validation would be invalid.
+        */
+       bool all_registers_used;
 };
 
 static uint32_t
@@ -118,6 +125,13 @@ raddr_add_a_to_live_reg_index(uint64_t inst)
                return ~0;
 }
 
+static bool
+live_reg_is_upper_half(uint32_t lri)
+{
+       return  (lri >= 16 && lri < 32) ||
+               (lri >= 32 + 16 && lri < 32 + 32);
+}
+
 static bool
 is_tmu_submit(uint32_t waddr)
 {
@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader,
                } else {
                        validation_state->live_immediates[lri] = ~0;
                }
+
+               if (live_reg_is_upper_half(lri))
+                       validation_state->all_registers_used = true;
        }
 
        switch (waddr) {
@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
                }
        }
 
+       if ((raddr_a >= 16 && raddr_a < 32) ||
+           (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
+               validation_state->all_registers_used = true;
+       }
+
        return true;
 }
 
@@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 {
        bool found_shader_end = false;
        int shader_end_ip = 0;
+       uint32_t last_thread_switch_ip = -3;
        uint32_t ip;
        struct vc4_validated_shader_info *validated_shader = NULL;
        struct vc4_shader_validation_state validation_state;
@@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                if (!vc4_handle_branch_target(&validation_state))
                        goto fail;
 
+               if (ip == last_thread_switch_ip + 3) {
+                       /* Reset r0-r3 live clamp data */
+                       int i;
+
+                       for (i = 64; i < LIVE_REG_COUNT; i++) {
+                               validation_state.live_min_clamp_offsets[i] = ~0;
+                               validation_state.live_max_clamp_regs[i] = false;
+                               validation_state.live_immediates[i] = ~0;
+                       }
+               }
+
                switch (sig) {
                case QPU_SIG_NONE:
                case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                case QPU_SIG_LOAD_TMU1:
                case QPU_SIG_PROG_END:
                case QPU_SIG_SMALL_IMM:
+               case QPU_SIG_THREAD_SWITCH:
+               case QPU_SIG_LAST_THREAD_SWITCH:
                        if (!check_instruction_writes(validated_shader,
                                                      &validation_state)) {
                                DRM_ERROR("Bad write at ip %d\n", ip);
@@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                                shader_end_ip = ip;
                        }
 
+                       if (sig == QPU_SIG_THREAD_SWITCH ||
+                           sig == QPU_SIG_LAST_THREAD_SWITCH) {
+                               validated_shader->is_threaded = true;
+
+                               if (ip < last_thread_switch_ip + 3) {
+                                       DRM_ERROR("Thread switch too soon after "
+                                                 "last switch at ip %d\n", ip);
+                                       goto fail;
+                               }
+                               last_thread_switch_ip = ip;
+                       }
+
                        break;
 
                case QPU_SIG_LOAD_IMM:
@@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                        if (!check_branch(inst, validated_shader,
                                          &validation_state, ip))
                                goto fail;
+
+                       if (ip < last_thread_switch_ip + 3) {
+                               DRM_ERROR("Branch in thread switch at ip %d",
+                                         ip);
+                               goto fail;
+                       }
+
                        break;
                default:
                        DRM_ERROR("Unsupported QPU signal %d at "
@@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                goto fail;
        }
 
+       /* Might corrupt other thread */
+       if (validated_shader->is_threaded &&
+           validation_state.all_registers_used) {
+               DRM_ERROR("Shader uses threading, but uses the upper "
+                         "half of the registers, too\n");
+               goto fail;
+       }
+
        /* If we did a backwards branch and we haven't emitted a uniforms
         * reset since then, we still need the uniforms stream to have the
         * uniforms address available so that the backwards branch can do its
index 69caa21f0cb23c9439238f6239c0041b178d5669..f07a090167261131076438960c1dec175d7d376e 100644 (file)
@@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state {
 #define DRM_VC4_PARAM_V3D_IDENT2               2
 #define DRM_VC4_PARAM_SUPPORTS_BRANCHES                3
 #define DRM_VC4_PARAM_SUPPORTS_ETC1            4
+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS     5
 
 struct drm_vc4_get_param {
        __u32 param;