Vector library cleanup (#473)

solidpixel · web-flow · commit 69bc17b8cd98 · 2024-07-17T15:42:14.000+01:00
The astcenc vector library effectively has two different class APIs:

- A 4-wide API which is used via explicit width types (e.g. vfloat4).
- A vector length agnostic API, which is used via implicit width types 
  (e.g. vfloat) in the codec that are resolved at compile time.

For historical reasons the classes that are only used as VLA types 
(e.g. vfloat8 for AVX2) implement more API than needed because it was
inherited from the original 4-wide implementation. This makes adding new
VLA implementations (e.g. Arm SVE) more expensive than needed.

This PR doesn't add SVE support, but minimizes the VLA API as a 
precursor to doing so. The main changes are:

* Remove VLA indexable .lane&lt;N&gt;() reads.
* Remove VLA float lane_id() factory functions.
* Replace VLA use of .lane&lt;0&gt;() with dedicated functions, e.g. use
  hmax_s() rather than hmax.lane&lt;0&gt;().
diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp
diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp
@@ -110,7 +110,7 @@ void unpack_weights(
 		{
 			vint summed_value(8);
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)
@@ -145,7 +145,7 @@ void unpack_weights(
 			vint sum_plane2(8);
 
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)
diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -889,7 +889,7 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)
@@ -947,7 +947,7 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)
diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
 		// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
 		vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
 		vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
-		vbest_error_index = hmin(vbest_error_index);
-		int best_error_index = vbest_error_index.lane<0>();
+
+		int best_error_index = hmin_s(vbest_error_index);
 
 		best_error_weights[i] = best_error_index;
 
diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h
@@ -74,18 +74,6 @@ struct vfloat8
 		m = _mm256_set1_ps(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vfloat8(
-		float a, float b, float c, float d,
-		float e, float f, float g, float h)
-	{
-		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -94,20 +82,6 @@ struct vfloat8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar value of a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256_f32[l];
-	#else
-		union { __m256 m; float f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -132,14 +106,6 @@ struct vfloat8
 		return vfloat8(_mm256_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
-	{
-		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief The vector ...
 	 */
@@ -190,18 +156,6 @@ struct vint8
 		m = _mm256_set1_epi32(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vint8(
-		int a, int b, int c, int d,
-		int e, int f, int g, int h)
-	{
-		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -210,20 +164,6 @@ struct vint8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar from a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE int lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256i_i32[l];
-	#else
-		union { __m256i m; int f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -528,6 +468,14 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
 	return vmin;
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmin(a).m);
+}
+
 /**
  * @brief Return the horizontal maximum of a vector.
  */
@@ -543,6 +491,14 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
 	return vmax;
 }
 
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmax(a).m);
+}
+
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */
@@ -570,14 +526,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
 	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
-{
-	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
@@ -786,19 +734,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
 	return a;
 }
 
-/**
- * @brief Return a clamped value between 0.0f and max.
- *
- * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
-{
-	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
-	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
-	return a;
-}
-
 /**
  * @brief Return a clamped value between 0.0f and 1.0f.
  *
@@ -857,7 +792,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
 {
-	return hmin(a).lane<0>();
+	return _mm256_cvtss_f32(hmin(a).m);
 }
 
 /**
@@ -887,7 +822,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
 {
-	return hmax(a).lane<0>();
+	return _mm256_cvtss_f32(hmax(a).m);
 }
 
 /**
@@ -1146,7 +1081,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
  * @brief Return a vector of interleaved RGBA data.
  *
  * Input vectors have the value stored in the bottom 8 bits of each lane,
- * with high  bits set to zero.
+ * with high bits set to zero.
  *
  * Output vector stores a single RGBA texel packed in each lane.
  */
diff --git a/Source/astcenc_vecmathlib_common_4.h b/Source/astcenc_vecmathlib_common_4.h
@@ -129,6 +129,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
 	return a.lane<0>() + a.lane<1>() + a.lane<2>();
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
+{
+	return hmax(a).lane<0>();
+}
+
 // ============================================================================
 // vfloat4 operators and functions
 // ============================================================================
@@ -222,18 +238,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
 	return min(max(a, minv), maxv);
 }
 
-/**
- * @brief Return the clamped value between 0.0f and max.
- *
- * It is assumed that  @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
-{
-	// Do not reorder - second operand will return if either is NaN
-	return min(max(a, vfloat4::zero()), maxv);
-}
-
 /**
  * @brief Return the clamped value between 0.0f and 1.0f.
  *
diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h
@@ -134,15 +134,6 @@ struct vfloat4
 		return vfloat4(vld1q_f32(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
-		return vfloat4(vld1q_f32(data));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -611,21 +602,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	alignas(16) int vals[4];
-	vals[0] = base[idx[0]];
-	vals[1] = base[idx[1]];
-	vals[2] = base[idx[2]];
-	vals[3] = base[idx[3]];
-	return vint4(vals);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h
@@ -139,14 +139,6 @@ struct vfloat4
 		return vfloat4(p);
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -684,17 +676,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	return vint4(base[indices.m[0]],
-	             base[indices.m[1]],
-	             base[indices.m[2]],
-	             base[indices.m[3]]);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h
@@ -142,14 +142,6 @@ struct vfloat4
 		return vfloat4(_mm_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(_mm_set_ps(3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -663,20 +655,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-#if ASTCENC_AVX >= 2
-	return vint4(_mm_i32gather_epi32(base, indices.m, 4));
-#else
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
-#endif
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
diff --git a/Source/astcenc_weight_align.cpp b/Source/astcenc_weight_align.cpp
@@ -164,7 +164,7 @@ static void compute_lowest_and_highest_weight(
 	promise(weight_count > 0);
 	promise(max_angular_steps > 0);
 
-	vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
+	vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
 
 	// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
 	for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)