Remove folded decimation_info weight arrays

solidpixel · solidpixel · commit d5aff7f6b923 · 2023-01-02T13:32:39.000Z
Falling back to the non-folded arrays is marginally slower when assuming
perfect memory (no cache misses) because of the additional indirect loads.
However, removing the folded arrays significantly improves cache
behavior, which offsets the loss.

This change reduces the context creation time and memory footprint of
the compressor. This is most significant for larger block sizes which
have the most decimation_info structures to create and access.
diff --git a/Docs/ChangeLog-4x.md b/Docs/ChangeLog-4x.md
@@ -6,6 +6,21 @@ release of the 4.x series.
 All performance data on this page is measured on an Intel Core i5-9600K
 clocked at 4.2 GHz, running `astcenc` using AVX2 and 6 threads.
 
+<!-- ---------------------------------------------------------------------- -->
+## 4.3.0
+
+**Status:** In development
+
+The 4.3.0 release is an optimization release. There are minor performance
+and image quality improvements in this release.
+
+* **General:**
+  * **Optimization:** Always skip blue-contraction for `QUANT_256` encodings.
+    This gives a small image quality improvement for the 4x4 block size.
+  * **Optimization:** Remove folded `decimation_info` lookup tables. This
+    reduces compressor memory footprint and improves context creation time.
+    Impact increases with the active block size.
+
 <!-- ---------------------------------------------------------------------- -->
 ## 4.2.0
 
@@ -175,4 +190,4 @@ Key for charts:
 
 - - -
 
-_Copyright © 2022, Arm Limited and contributors. All rights reserved._
+_Copyright © 2022-2023, Arm Limited and contributors. All rights reserved._
diff --git a/Source/astcenc_block_sizes.cpp b/Source/astcenc_block_sizes.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2023 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -359,30 +359,17 @@ static void init_decimation_info_2d(
 			di.weight_texel[j][i] = texel;
 			di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
 
-			// perform a layer of array unrolling. An aspect of this unrolling is that
-			// one of the texel-weight indexes is an identity-mapped index; we will use this
-			// fact to reorder the indexes so that the first one is the identity index.
-			int swap_idx = -1;
+			// Store the per-texel contribution of this weight for each texel it contributes to
+			di.texel_weight_for_weight[i][j] = 0.0f;
 			for (unsigned int k = 0; k < 4; k++)
 			{
 				uint8_t dttw = di.texel_weights_4t[k][texel];
 				float dttwf = di.texel_weights_float_4t[k][texel];
 				if (dttw == i && dttwf != 0.0f)
 				{
-					swap_idx = k;
+					di.texel_weight_for_weight[i][j] = di.texel_weights_float_4t[k][texel];
+					break;
 				}
-				di.texel_weights_texel[i][j][k] = dttw;
-				di.texel_weights_float_texel[i][j][k] = dttwf;
-			}
-
-			if (swap_idx != 0)
-			{
-				uint8_t vi = di.texel_weights_texel[i][j][0];
-				float vf = di.texel_weights_float_texel[i][j][0];
-				di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
-				di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
-				di.texel_weights_texel[i][j][swap_idx] = vi;
-				di.texel_weights_float_texel[i][j][swap_idx] = vf;
 			}
 		}
 
@@ -628,30 +615,17 @@ static void init_decimation_info_3d(
 			di.weight_texel[j][i] = static_cast<uint8_t>(texel);
 			di.weights_flt[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]);
 
-			// perform a layer of array unrolling. An aspect of this unrolling is that
-			// one of the texel-weight indexes is an identity-mapped index; we will use this
-			// fact to reorder the indexes so that the first one is the identity index.
-			int swap_idx = -1;
+			// Store the per-texel contribution of this weight for each texel it contributes to
+			di.texel_weight_for_weight[i][j] = 0.0f;
 			for (unsigned int k = 0; k < 4; k++)
 			{
 				uint8_t dttw = di.texel_weights_4t[k][texel];
 				float dttwf = di.texel_weights_float_4t[k][texel];
 				if (dttw == i && dttwf != 0.0f)
 				{
-					swap_idx = k;
+					di.texel_weight_for_weight[i][j] = di.texel_weights_float_4t[k][texel];
+					break;
 				}
-				di.texel_weights_texel[i][j][k] = dttw;
-				di.texel_weights_float_texel[i][j][k] = dttwf;
-			}
-
-			if (swap_idx != 0)
-			{
-				uint8_t vi = di.texel_weights_texel[i][j][0];
-				float vf = di.texel_weights_float_texel[i][j][0];
-				di.texel_weights_texel[i][j][0] = di.texel_weights_texel[i][j][swap_idx];
-				di.texel_weights_float_texel[i][j][0] = di.texel_weights_float_texel[i][j][swap_idx];
-				di.texel_weights_texel[i][j][swap_idx] = vi;
-				di.texel_weights_float_texel[i][j][swap_idx] = vf;
 			}
 		}
 
diff --git a/Source/astcenc_compress_symbolic.cpp b/Source/astcenc_compress_symbolic.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2023 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -279,15 +279,12 @@ static bool realign_weights_decimated(
 			{
 				unsigned int texel = di.weight_texel[te_idx][we_idx];
 
-				const uint8_t *texel_weights = di.texel_weights_texel[we_idx][te_idx];
-				const float *texel_weights_float = di.texel_weights_float_texel[we_idx][te_idx];
+				float tw_base = di.texel_weight_for_weight[we_idx][te_idx];
 
-				float tw_base = texel_weights_float[0];
-
-				float weight_base = (uqw_base                      * tw_base
-				                   + uq_weightsf[texel_weights[1]] * texel_weights_float[1])
-				                  + (uq_weightsf[texel_weights[2]] * texel_weights_float[2]
-				                   + uq_weightsf[texel_weights[3]] * texel_weights_float[3]);
+				float weight_base = (uq_weightsf[di.texel_weights_4t[0][texel]] * di.texel_weights_float_4t[0][texel]
+				                   + uq_weightsf[di.texel_weights_4t[1][texel]] * di.texel_weights_float_4t[1][texel])
+					              + (uq_weightsf[di.texel_weights_4t[2][texel]] * di.texel_weights_float_4t[2][texel]
+				                   + uq_weightsf[di.texel_weights_4t[3][texel]] * di.texel_weights_float_4t[3][texel]);
 
 				// Ideally this is integer rounded, but IQ gain it isn't worth the overhead
 				// float weight = astc::flt_rd(weight_base + 0.5f);
diff --git a/Source/astcenc_internal.h b/Source/astcenc_internal.h
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2023 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -381,17 +381,8 @@ struct decimation_info
 	/** @brief The list of weight indices that contribute to each texel. */
 	alignas(ASTCENC_VECALIGN) float weights_flt[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS];
 
-	/**
-	 * @brief Folded structure for faster access:
-	 *     texel_weights_texel[i][j][.] = texel_weights[.][weight_texel[i][j]]
-	 */
-	uint8_t texel_weights_texel[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS][4];
-
-	/**
-	 * @brief Folded structure for faster access:
-	 *     texel_weights_float_texel[i][j][.] = texel_weights_float[.][weight_texel[i][j]]
-	 */
-	float texel_weights_float_texel[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS][4];
+	/** @brief The weight contribution to the total texel weighting for each weight and texel. */
+	float texel_weight_for_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
 };
 
 /**