Skip to content

Commit 52211a6

Browse files
authored
Restrict use of angular endpoint search (#332)
The angular weight endpoint search is expensive, and of limited value for weights with finer quantization applied. This PR limits the search to coarser quant levels, fitting the entire search inside a single vec8 SIMD iteration.
1 parent 50dfa8a commit 52211a6

8 files changed

Lines changed: 514 additions & 471 deletions

Source/astcenc_block_sizes.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1100,7 +1100,6 @@ static void construct_block_size_descriptor_3d(
11001100
bsd.decimation_mode_count_selected = decimation_mode_count;
11011101
bsd.decimation_mode_count_all = decimation_mode_count;
11021102

1103-
// Construct the list of block formats
11041103
// Construct the list of block formats referencing the decimation tables
11051104

11061105
// Clear the list to a known-bad value

Source/astcenc_entry.cpp

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,13 @@ struct astcenc_preset_config
6262
static const std::array<astcenc_preset_config, 5> preset_configs_high {{
6363
{
6464
ASTCENC_PRE_FASTEST,
65-
2, 8, 42, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
65+
2, 10, 42, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 25
6666
}, {
6767
ASTCENC_PRE_FAST,
68-
3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
68+
3, 14, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.65f, 20
6969
}, {
7070
ASTCENC_PRE_MEDIUM,
71-
4, 26, 76, 3, 3 , 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
71+
4, 28, 76, 3, 3 , 95.0f, 70.0f, 2.5f, 2.5f, 1.2f, 1.25f, 0.85f, 16
7272
}, {
7373
ASTCENC_PRE_THOROUGH,
7474
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 12
@@ -78,21 +78,20 @@ static const std::array<astcenc_preset_config, 5> preset_configs_high {{
7878
}
7979
}};
8080

81-
8281
/**
8382
* @brief The static quality presets that are built-in for medium bandwidth
8483
* presets (25 <= x < 64 texels per block).
8584
*/
8685
static const std::array<astcenc_preset_config, 5> preset_configs_mid {{
8786
{
8887
ASTCENC_PRE_FASTEST,
89-
2, 8, 40, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
88+
2, 10, 40, 2, 2, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.0f, 0.5f, 20
9089
}, {
9190
ASTCENC_PRE_FAST,
92-
3, 12, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
91+
3, 14, 55, 3, 3, 85.2f, 63.2f, 3.5f, 3.5f, 1.0f, 1.1f, 0.5f, 16
9392
}, {
9493
ASTCENC_PRE_MEDIUM,
95-
4, 26, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
94+
4, 28, 76, 3, 3, 95.0f, 70.0f, 3.0f, 3.0f, 1.2f, 1.25f, 0.75f, 14
9695
}, {
9796
ASTCENC_PRE_THOROUGH,
9897
4, 76, 93, 4, 4, 105.0f, 77.0f, 10.0f, 10.0f, 2.5f, 1.25f, 0.95f, 10

Source/astcenc_internal.h

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -129,12 +129,27 @@ static constexpr float ERROR_CALC_DEFAULT { 1e30f };
129129
static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 };
130130

131131
/**
132-
* @brief The maximum number of candidate encodings tested for each encoding mode..
132+
* @brief The maximum number of candidate encodings tested for each encoding mode.
133133
*
134134
* This can be dynamically reduced by the compression quality preset.
135135
*/
136136
static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 4 };
137137

138+
/**
139+
* @brief The maximum quant level that uses the full angular endpoint search method.
140+
*
141+
* The angular endpoint search is used to find the min/max weight that should
142+
* be used for a given quantization level. It is effective but expensive, so
143+
* we only use it where it has the most value - low quant levels with wide
144+
* spacing. It is used up to and including TUNE_MAX_ANGULAR_QUANT. Above this we
145+
* assume the min weight is 0.0f, and the max weight is 1.0f.
146+
*
147+
* Note the angular algorithm is vectorized, and using QUANT_12 exactly fills
148+
* one 8-wide vector. Decreasing by one doesn't buy much performance, and
149+
* increasing by one is disproportionately expensive.
150+
*/
151+
static constexpr unsigned int TUNE_MAX_ANGULAR_QUANT { 7 }; /* QUANT_12 */
152+
138153

139154
static_assert((BLOCK_MAX_TEXELS % ASTCENC_SIMD_WIDTH) == 0,
140155
"BLOCK_MAX_TEXELS must be multiple of ASTCENC_SIMD_WIDTH");
@@ -1135,10 +1150,10 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
11351150
float weight_high_value1[WEIGHTS_MAX_BLOCK_MODES];
11361151

11371152
/** @brief The low weight value in plane 1 for each quant level and decimation mode. */
1138-
float weight_low_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
1153+
float weight_low_values1[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
11391154

11401155
/** @brief The high weight value in plane 1 for each quant level and decimation mode. */
1141-
float weight_high_values1[WEIGHTS_MAX_DECIMATION_MODES][12];
1156+
float weight_high_values1[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
11421157

11431158
/** @brief The low weight value in plane 2 for each block mode. */
11441159
float weight_low_value2[WEIGHTS_MAX_BLOCK_MODES];
@@ -1147,10 +1162,10 @@ struct alignas(ASTCENC_VECALIGN) compression_working_buffers
11471162
float weight_high_value2[WEIGHTS_MAX_BLOCK_MODES];
11481163

11491164
/** @brief The low weight value in plane 2 for each quant level and decimation mode. */
1150-
float weight_low_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
1165+
float weight_low_values2[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
11511166

11521167
/** @brief The high weight value in plane 2 for each quant level and decimation mode. */
1153-
float weight_high_values2[WEIGHTS_MAX_DECIMATION_MODES][12];
1168+
float weight_high_values2[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
11541169
};
11551170

11561171
struct dt_init_working_buffers

Source/astcenc_weight_align.cpp

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -236,8 +236,8 @@ static void compute_angular_endpoints_for_quant_levels(
236236
unsigned int weight_count,
237237
const float* dec_weight_ideal_value,
238238
unsigned int max_quant_level,
239-
float low_value[12],
240-
float high_value[12]
239+
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
240+
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
241241
) {
242242
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
243243
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
@@ -414,8 +414,8 @@ static void compute_angular_endpoints_for_quant_levels_lwc(
414414
unsigned int weight_count,
415415
const float* dec_weight_ideal_value,
416416
unsigned int max_quant_level,
417-
float low_value[12],
418-
float high_value[12]
417+
float low_value[TUNE_MAX_ANGULAR_QUANT + 1],
418+
float high_value[TUNE_MAX_ANGULAR_QUANT + 1]
419419
) {
420420
unsigned int max_quant_steps = steps_for_quant_level[max_quant_level];
421421
unsigned int max_angular_steps = steps_for_quant_level[max_quant_level];
@@ -493,8 +493,8 @@ void compute_angular_endpoints_1plane(
493493
float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1;
494494
float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1;
495495

496-
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
497-
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
496+
float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
497+
float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
498498

499499
unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always
500500
: bsd.decimation_mode_count_selected;
@@ -509,19 +509,25 @@ void compute_angular_endpoints_1plane(
509509

510510
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
511511

512+
unsigned int max_precision = dm.maxprec_1plane;
513+
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
514+
{
515+
max_precision = TUNE_MAX_ANGULAR_QUANT;
516+
}
517+
512518
if (weight_count < tune_low_weight_limit)
513519
{
514520
compute_angular_endpoints_for_quant_levels_lwc(
515521
weight_count,
516522
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
517-
dm.maxprec_1plane, low_values[i], high_values[i]);
523+
max_precision, low_values[i], high_values[i]);
518524
}
519525
else
520526
{
521527
compute_angular_endpoints_for_quant_levels(
522528
weight_count,
523529
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
524-
dm.maxprec_1plane, low_values[i], high_values[i]);
530+
max_precision, low_values[i], high_values[i]);
525531
}
526532
}
527533

@@ -536,8 +542,16 @@ void compute_angular_endpoints_1plane(
536542
unsigned int quant_mode = bm.quant_mode;
537543
unsigned int decim_mode = bm.decimation_mode;
538544

539-
low_value[i] = low_values[decim_mode][quant_mode];
540-
high_value[i] = high_values[decim_mode][quant_mode];
545+
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
546+
{
547+
low_value[i] = low_values[decim_mode][quant_mode];
548+
high_value[i] = high_values[decim_mode][quant_mode];
549+
}
550+
else
551+
{
552+
low_value[i] = 0.0f;
553+
high_value[i] = 1.0f;
554+
}
541555
}
542556
}
543557

@@ -553,10 +567,10 @@ void compute_angular_endpoints_2planes(
553567
float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2;
554568
float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2;
555569

556-
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values1;
557-
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values1;
558-
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_low_values2;
559-
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][12] = tmpbuf.weight_high_values2;
570+
float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1;
571+
float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1;
572+
float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2;
573+
float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2;
560574

561575
promise(bsd.decimation_mode_count_selected > 0);
562576
for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++)
@@ -569,29 +583,35 @@ void compute_angular_endpoints_2planes(
569583

570584
unsigned int weight_count = bsd.get_decimation_info(i).weight_count;
571585

586+
unsigned int max_precision = dm.maxprec_2planes;
587+
if (max_precision > TUNE_MAX_ANGULAR_QUANT)
588+
{
589+
max_precision = TUNE_MAX_ANGULAR_QUANT;
590+
}
591+
572592
if (weight_count < tune_low_weight_limit)
573593
{
574594
compute_angular_endpoints_for_quant_levels_lwc(
575595
weight_count,
576596
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
577-
dm.maxprec_2planes, low_values1[i], high_values1[i]);
597+
max_precision, low_values1[i], high_values1[i]);
578598

579599
compute_angular_endpoints_for_quant_levels_lwc(
580600
weight_count,
581601
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
582-
dm.maxprec_2planes, low_values2[i], high_values2[i]);
602+
max_precision, low_values2[i], high_values2[i]);
583603
}
584604
else
585605
{
586606
compute_angular_endpoints_for_quant_levels(
587607
weight_count,
588608
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS,
589-
dm.maxprec_2planes, low_values1[i], high_values1[i]);
609+
max_precision, low_values1[i], high_values1[i]);
590610

591611
compute_angular_endpoints_for_quant_levels(
592612
weight_count,
593613
dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET,
594-
dm.maxprec_2planes, low_values2[i], high_values2[i]);
614+
max_precision, low_values2[i], high_values2[i]);
595615
}
596616
}
597617

@@ -603,10 +623,20 @@ void compute_angular_endpoints_2planes(
603623
unsigned int quant_mode = bm.quant_mode;
604624
unsigned int decim_mode = bm.decimation_mode;
605625

606-
low_value1[i] = low_values1[decim_mode][quant_mode];
607-
high_value1[i] = high_values1[decim_mode][quant_mode];
608-
low_value2[i] = low_values2[decim_mode][quant_mode];
609-
high_value2[i] = high_values2[decim_mode][quant_mode];
626+
if (quant_mode <= TUNE_MAX_ANGULAR_QUANT)
627+
{
628+
low_value1[i] = low_values1[decim_mode][quant_mode];
629+
high_value1[i] = high_values1[decim_mode][quant_mode];
630+
low_value2[i] = low_values2[decim_mode][quant_mode];
631+
high_value2[i] = high_values2[decim_mode][quant_mode];
632+
}
633+
else
634+
{
635+
low_value1[i] = 0.0f;
636+
high_value1[i] = 1.0f;
637+
low_value2[i] = 0.0f;
638+
high_value2[i] = 1.0f;
639+
}
610640
}
611641
}
612642

0 commit comments

Comments
 (0)