From: Krunoslav Kovac Date: Tue, 18 Jun 2019 21:38:43 +0000 (-0400) Subject: drm/amd/display: Optimize gamma calculations X-Git-Url: http://git.lede-project.org./?a=commitdiff_plain;h=e752058b8671c6c87e484cff144c5c6309a37253;p=openwrt%2Fstaging%2Fblogic.git drm/amd/display: Optimize gamma calculations [Why&How] 1. Stack usage is pretty high as fixed31_32 struct is 8 bytes and we have functions with >30 vars on the stack. 2. Optimize gamma calculation by reducing number of calls to dc_fixpt_pow Our X points are divided into 32 regions wth 16 pts each. Each region is 2x the previous, meaning x[i] = 2*x[i-16] for i>=16. Using (2x)^gamma = 2^gamma * x^gamma, we can recursively compute powers of gamma, we just need first 16 pts to start it up. dc_fixpt_pow() is expensive, it computes x^y by doing exp(y*logx) Exp is done by Taylor series approximation, and log by Newton-like approximation that also uses exp internally. In short, it's significantly heavier than run-of-the-mill addition/subtraction/multiply. Signed-off-by: Krunoslav Kovac Reviewed-by: Anthony Koo Acked-by: Aric Cyr Acked-by: Leo Li Signed-off-by: Alex Deucher --- diff --git a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h index 22db5682aa6c..e9a6225f4720 100644 --- a/drivers/gpu/drm/amd/display/dc/dc_hw_types.h +++ b/drivers/gpu/drm/amd/display/dc/dc_hw_types.h @@ -482,7 +482,6 @@ struct dc_gamma { * is_logical_identity indicates the given gamma ramp regardless of type is identity. */ bool is_identity; - bool is_logical_identity; }; /* Used by both ipp amd opp functions*/ diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c index 88898935a5e6..ed894cddeee5 100644 --- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.c +++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.c @@ -40,6 +40,33 @@ static struct hw_x_point coordinates_x[MAX_HW_POINTS + 2]; static struct fixed31_32 pq_table[MAX_HW_POINTS + 2]; static struct fixed31_32 de_pq_table[MAX_HW_POINTS + 2]; +// these are helpers for calculations to reduce stack usage +// do not depend on these being preserved across calls +static struct fixed31_32 scratch_1; +static struct fixed31_32 scratch_2; +static struct translate_from_linear_space_args scratch_gamma_args; + +/* Helper to optimize gamma calculation, only use in translate_from_linear, in + * particular the dc_fixpt_pow function which is very expensive + * The idea is that our regions for X points are exponential and currently they all use + * the same number of points (NUM_PTS_IN_REGION) and in each region every point + * is exactly 2x the one at the same index in the previous region. In other words + * X[i] = 2 * X[i-NUM_PTS_IN_REGION] for i>=16 + * The other fact is that (2x)^gamma = 2^gamma * x^gamma + * So we compute and save x^gamma for the first 16 regions, and for every next region + * just multiply with 2^gamma which can be computed once, and save the result so we + * recursively compute all the values. + */ +static struct fixed31_32 pow_buffer[NUM_PTS_IN_REGION]; +static struct fixed31_32 gamma_of_2; // 2^gamma +int pow_buffer_ptr = -1; + +static const int32_t gamma_numerator01[] = { 31308, 180000, 0}; +static const int32_t gamma_numerator02[] = { 12920, 4500, 0}; +static const int32_t gamma_numerator03[] = { 55, 99, 0}; +static const int32_t gamma_numerator04[] = { 55, 99, 0}; +static const int32_t gamma_numerator05[] = { 2400, 2200, 2200}; + static bool pq_initialized; /* = false; */ static bool de_pq_initialized; /* = false; */ @@ -251,11 +278,7 @@ enum gamma_type_index { static void build_coefficients(struct gamma_coefficients *coefficients, enum gamma_type_index type) { - static const int32_t numerator01[] = { 31308, 180000, 0}; - static const int32_t numerator02[] = { 12920, 4500, 0}; - static const int32_t numerator03[] = { 55, 99, 0}; - static const int32_t numerator04[] = { 55, 99, 0}; - static const int32_t numerator05[] = { 2400, 2200, 2200}; + uint32_t i = 0; uint32_t index = 0; @@ -267,69 +290,74 @@ static void build_coefficients(struct gamma_coefficients *coefficients, enum gam do { coefficients->a0[i] = dc_fixpt_from_fraction( - numerator01[index], 10000000); + gamma_numerator01[index], 10000000); coefficients->a1[i] = dc_fixpt_from_fraction( - numerator02[index], 1000); + gamma_numerator02[index], 1000); coefficients->a2[i] = dc_fixpt_from_fraction( - numerator03[index], 1000); + gamma_numerator03[index], 1000); coefficients->a3[i] = dc_fixpt_from_fraction( - numerator04[index], 1000); + gamma_numerator04[index], 1000); coefficients->user_gamma[i] = dc_fixpt_from_fraction( - numerator05[index], 1000); + gamma_numerator05[index], 1000); ++i; } while (i != ARRAY_SIZE(coefficients->a0)); } static struct fixed31_32 translate_from_linear_space( - struct fixed31_32 arg, - struct fixed31_32 a0, - struct fixed31_32 a1, - struct fixed31_32 a2, - struct fixed31_32 a3, - struct fixed31_32 gamma) + struct translate_from_linear_space_args *args) { const struct fixed31_32 one = dc_fixpt_from_int(1); - if (dc_fixpt_lt(one, arg)) + if (dc_fixpt_le(one, args->arg)) return one; - if (dc_fixpt_le(arg, dc_fixpt_neg(a0))) - return dc_fixpt_sub( - a2, - dc_fixpt_mul( - dc_fixpt_add( - one, - a3), - dc_fixpt_pow( - dc_fixpt_neg(arg), - dc_fixpt_recip(gamma)))); - else if (dc_fixpt_le(a0, arg)) - return dc_fixpt_sub( - dc_fixpt_mul( - dc_fixpt_add( - one, - a3), - dc_fixpt_pow( - arg, - dc_fixpt_recip(gamma))), - a2); + if (dc_fixpt_le(args->arg, dc_fixpt_neg(args->a0))) { + scratch_1 = dc_fixpt_add(one, args->a3); + scratch_2 = dc_fixpt_pow( + dc_fixpt_neg(args->arg), + dc_fixpt_recip(args->gamma)); + scratch_1 = dc_fixpt_mul(scratch_1, scratch_2); + scratch_1 = dc_fixpt_sub(args->a2, scratch_1); + + return scratch_1; + } else if (dc_fixpt_le(args->a0, args->arg)) { + if (pow_buffer_ptr == 0) { + gamma_of_2 = dc_fixpt_pow(dc_fixpt_from_int(2), + dc_fixpt_recip(args->gamma)); + } + scratch_1 = dc_fixpt_add(one, args->a3); + if (pow_buffer_ptr < 16) + scratch_2 = dc_fixpt_pow(args->arg, + dc_fixpt_recip(args->gamma)); + else + scratch_2 = dc_fixpt_mul(gamma_of_2, + pow_buffer[pow_buffer_ptr%16]); + + pow_buffer[pow_buffer_ptr%16] = scratch_2; + pow_buffer_ptr++; + + scratch_1 = dc_fixpt_mul(scratch_1, scratch_2); + scratch_1 = dc_fixpt_sub(scratch_1, args->a2); + + return scratch_1; + } else - return dc_fixpt_mul( - arg, - a1); + return dc_fixpt_mul(args->arg, args->a1); } static struct fixed31_32 calculate_gamma22(struct fixed31_32 arg) { struct fixed31_32 gamma = dc_fixpt_from_fraction(22, 10); - return translate_from_linear_space(arg, - dc_fixpt_zero, - dc_fixpt_zero, - dc_fixpt_zero, - dc_fixpt_zero, - gamma); + scratch_gamma_args.arg = arg; + scratch_gamma_args.a0 = dc_fixpt_zero; + scratch_gamma_args.a1 = dc_fixpt_zero; + scratch_gamma_args.a2 = dc_fixpt_zero; + scratch_gamma_args.a3 = dc_fixpt_zero; + scratch_gamma_args.gamma = gamma; + + return translate_from_linear_space(&scratch_gamma_args); } static struct fixed31_32 translate_to_linear_space( @@ -365,18 +393,19 @@ static struct fixed31_32 translate_to_linear_space( return linear; } -static inline struct fixed31_32 translate_from_linear_space_ex( +static struct fixed31_32 translate_from_linear_space_ex( struct fixed31_32 arg, struct gamma_coefficients *coeff, uint32_t color_index) { - return translate_from_linear_space( - arg, - coeff->a0[color_index], - coeff->a1[color_index], - coeff->a2[color_index], - coeff->a3[color_index], - coeff->user_gamma[color_index]); + scratch_gamma_args.arg = arg; + scratch_gamma_args.a0 = coeff->a0[color_index]; + scratch_gamma_args.a1 = coeff->a1[color_index]; + scratch_gamma_args.a2 = coeff->a2[color_index]; + scratch_gamma_args.a3 = coeff->a3[color_index]; + scratch_gamma_args.gamma = coeff->user_gamma[color_index]; + + return translate_from_linear_space(&scratch_gamma_args); } @@ -715,24 +744,32 @@ static void build_regamma(struct pwl_float_data_ex *rgb_regamma, { uint32_t i; - struct gamma_coefficients coeff; + struct gamma_coefficients *coeff; struct pwl_float_data_ex *rgb = rgb_regamma; const struct hw_x_point *coord_x = coordinate_x; - build_coefficients(&coeff, type); + coeff = kvzalloc(sizeof(*coeff), GFP_KERNEL); + if (!coeff) + return; - i = 0; + build_coefficients(coeff, type); - while (i != hw_points_num + 1) { + memset(pow_buffer, 0, NUM_PTS_IN_REGION * sizeof(struct fixed31_32)); + pow_buffer_ptr = 0; // see variable definition for more info + i = 0; + while (i <= hw_points_num) { /*TODO use y vs r,g,b*/ rgb->r = translate_from_linear_space_ex( - coord_x->x, &coeff, 0); + coord_x->x, coeff, 0); rgb->g = rgb->r; rgb->b = rgb->r; ++coord_x; ++rgb; ++i; } + pow_buffer_ptr = -1; // reset back to no optimize + + kfree(coeff); } static void hermite_spline_eetf(struct fixed31_32 input_x, @@ -862,6 +899,8 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma, else max_content = max_display; + if (!use_eetf) + pow_buffer_ptr = 0; // see var definition for more info rgb += 32; // first 32 points have problems with fixed point, too small coord_x += 32; for (i = 32; i <= hw_points_num; i++) { @@ -900,6 +939,7 @@ static bool build_freesync_hdr(struct pwl_float_data_ex *rgb_regamma, ++coord_x; ++rgb; } + pow_buffer_ptr = -1; return true; } @@ -1572,14 +1612,15 @@ bool mod_color_calculate_regamma_params(struct dc_transfer_func *output_tf, output_tf->tf == TRANSFER_FUNCTION_SRGB) { if (ramp == NULL) return true; - if ((ramp->is_logical_identity) || + if ((ramp->is_identity && ramp->type != GAMMA_CS_TFM_1D) || (!mapUserRamp && ramp->type == GAMMA_RGB_256)) return true; } output_tf->type = TF_TYPE_DISTRIBUTED_POINTS; - if (ramp && (mapUserRamp || ramp->type != GAMMA_RGB_256)) { + if (ramp && ramp->type != GAMMA_CS_TFM_1D && + (mapUserRamp || ramp->type != GAMMA_RGB_256)) { rgb_user = kvcalloc(ramp->num_entries + _EXTRA_POINTS, sizeof(*rgb_user), GFP_KERNEL); diff --git a/drivers/gpu/drm/amd/display/modules/color/color_gamma.h b/drivers/gpu/drm/amd/display/modules/color/color_gamma.h index 369953fafadf..69cecd2ec251 100644 --- a/drivers/gpu/drm/amd/display/modules/color/color_gamma.h +++ b/drivers/gpu/drm/amd/display/modules/color/color_gamma.h @@ -82,6 +82,15 @@ struct freesync_hdr_tf_params { unsigned int skip_tm; // skip tm }; +struct translate_from_linear_space_args { + struct fixed31_32 arg; + struct fixed31_32 a0; + struct fixed31_32 a1; + struct fixed31_32 a2; + struct fixed31_32 a3; + struct fixed31_32 gamma; +}; + void setup_x_points_distribution(void); void precompute_pq(void); void precompute_de_pq(void);