Skip to content

Commit 69bc17b

Browse files
authored
Vector library cleanup (#473)
The astcenc vector library effectively has two different class APIs: - A 4-wide API which is used via explicit width types (e.g. vfloat4). - A vector length agnostic API, which is used via implicit width types (e.g. vfloat) in the codec that are resolved at compile time. For historical reasons the classes that are only used as VLA types (e.g. vfloat8 for AVX2) implement more API than needed because it was inherited from the original 4-wide implementation. This makes adding new VLA implementations (e.g. Arm SVE) more expensive than needed. This PR doesn't add SVE support, but minimizes the VLA API as a precursor to doing so. The main changes are: * Remove VLA indexable .lane<N>() reads. * Remove VLA float lane_id() factory functions. * Replace VLA use of .lane<0>() with dedicated functions, e.g. use hmax_s() rather than hmax.lane<0>().
1 parent ffdc45e commit 69bc17b

10 files changed

Lines changed: 1123 additions & 1029 deletions

Source/UnitTest/test_simd.cpp

Lines changed: 1080 additions & 860 deletions
Large diffs are not rendered by default.

Source/astcenc_decompress_symbolic.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ void unpack_weights(
110110
{
111111
vint summed_value(8);
112112
vint weight_count(di.texel_weight_count + i);
113-
int max_weight_count = hmax(weight_count).lane<0>();
113+
int max_weight_count = hmax_s(weight_count);
114114

115115
promise(max_weight_count > 0);
116116
for (int j = 0; j < max_weight_count; j++)
@@ -145,7 +145,7 @@ void unpack_weights(
145145
vint sum_plane2(8);
146146

147147
vint weight_count(di.texel_weight_count + i);
148-
int max_weight_count = hmax(weight_count).lane<0>();
148+
int max_weight_count = hmax_s(weight_count);
149149

150150
promise(max_weight_count > 0);
151151
for (int j = 0; j < max_weight_count; j++)

Source/astcenc_ideal_endpoints_and_weights.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -889,7 +889,7 @@ void compute_ideal_weights_for_decimation(
889889

890890
// Accumulate error weighting of all the texels using this weight
891891
vint weight_texel_count(di.weight_texel_count + i);
892-
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
892+
unsigned int max_texel_count = hmax_s(weight_texel_count);
893893
promise(max_texel_count > 0);
894894

895895
for (unsigned int j = 0; j < max_texel_count; j++)
@@ -947,7 +947,7 @@ void compute_ideal_weights_for_decimation(
947947

948948
// Accumulate error weighting of all the texels using this weight
949949
vint weight_texel_count(di.weight_texel_count + i);
950-
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
950+
unsigned int max_texel_count = hmax_s(weight_texel_count);
951951
promise(max_texel_count > 0);
952952

953953
for (unsigned int j = 0; j < max_texel_count; j++)

Source/astcenc_pick_best_endpoint_format.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// SPDX-License-Identifier: Apache-2.0
22
// ----------------------------------------------------------------------------
3-
// Copyright 2011-2022 Arm Limited
3+
// Copyright 2011-2024 Arm Limited
44
//
55
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
66
// use this file except in compliance with the License. You may obtain a copy
@@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
13061306
// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
13071307
vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
13081308
vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
1309-
vbest_error_index = hmin(vbest_error_index);
1310-
int best_error_index = vbest_error_index.lane<0>();
1309+
1310+
int best_error_index = hmin_s(vbest_error_index);
13111311

13121312
best_error_weights[i] = best_error_index;
13131313

Source/astcenc_vecmathlib_avx2_8.h

Lines changed: 19 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -74,18 +74,6 @@ struct vfloat8
7474
m = _mm256_set1_ps(a);
7575
}
7676

77-
/**
78-
* @brief Construct from 8 scalar values.
79-
*
80-
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
81-
*/
82-
ASTCENC_SIMD_INLINE explicit vfloat8(
83-
float a, float b, float c, float d,
84-
float e, float f, float g, float h)
85-
{
86-
m = _mm256_set_ps(h, g, f, e, d, c, b, a);
87-
}
88-
8977
/**
9078
* @brief Construct from an existing SIMD register.
9179
*/
@@ -94,20 +82,6 @@ struct vfloat8
9482
m = a;
9583
}
9684

97-
/**
98-
* @brief Get the scalar value of a single lane.
99-
*/
100-
template <int l> ASTCENC_SIMD_INLINE float lane() const
101-
{
102-
#if !defined(__clang__) && defined(_MSC_VER)
103-
return m.m256_f32[l];
104-
#else
105-
union { __m256 m; float f[8]; } cvt;
106-
cvt.m = m;
107-
return cvt.f[l];
108-
#endif
109-
}
110-
11185
/**
11286
* @brief Factory that returns a vector of zeros.
11387
*/
@@ -132,14 +106,6 @@ struct vfloat8
132106
return vfloat8(_mm256_load_ps(p));
133107
}
134108

135-
/**
136-
* @brief Factory that returns a vector containing the lane IDs.
137-
*/
138-
static ASTCENC_SIMD_INLINE vfloat8 lane_id()
139-
{
140-
return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
141-
}
142-
143109
/**
144110
* @brief The vector ...
145111
*/
@@ -190,18 +156,6 @@ struct vint8
190156
m = _mm256_set1_epi32(a);
191157
}
192158

193-
/**
194-
* @brief Construct from 8 scalar values.
195-
*
196-
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
197-
*/
198-
ASTCENC_SIMD_INLINE explicit vint8(
199-
int a, int b, int c, int d,
200-
int e, int f, int g, int h)
201-
{
202-
m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
203-
}
204-
205159
/**
206160
* @brief Construct from an existing SIMD register.
207161
*/
@@ -210,20 +164,6 @@ struct vint8
210164
m = a;
211165
}
212166

213-
/**
214-
* @brief Get the scalar from a single lane.
215-
*/
216-
template <int l> ASTCENC_SIMD_INLINE int lane() const
217-
{
218-
#if !defined(__clang__) && defined(_MSC_VER)
219-
return m.m256i_i32[l];
220-
#else
221-
union { __m256i m; int f[8]; } cvt;
222-
cvt.m = m;
223-
return cvt.f[l];
224-
#endif
225-
}
226-
227167
/**
228168
* @brief Factory that returns a vector of zeros.
229169
*/
@@ -528,6 +468,14 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
528468
return vmin;
529469
}
530470

471+
/**
472+
* @brief Return the horizontal minimum of a vector.
473+
*/
474+
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
475+
{
476+
return _mm256_cvtsi256_si32(hmin(a).m);
477+
}
478+
531479
/**
532480
* @brief Return the horizontal maximum of a vector.
533481
*/
@@ -543,6 +491,14 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
543491
return vmax;
544492
}
545493

494+
/**
495+
* @brief Return the horizontal maximum of a vector.
496+
*/
497+
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
498+
{
499+
return _mm256_cvtsi256_si32(hmax(a).m);
500+
}
501+
546502
/**
547503
* @brief Store a vector to a 16B aligned memory address.
548504
*/
@@ -570,14 +526,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
570526
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
571527
}
572528

573-
/**
574-
* @brief Gather N (vector width) indices from the array.
575-
*/
576-
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
577-
{
578-
return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
579-
}
580-
581529
/**
582530
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
583531
*/
@@ -786,19 +734,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
786734
return a;
787735
}
788736

789-
/**
790-
* @brief Return a clamped value between 0.0f and max.
791-
*
792-
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
793-
* be returned for that lane.
794-
*/
795-
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
796-
{
797-
a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
798-
a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
799-
return a;
800-
}
801-
802737
/**
803738
* @brief Return a clamped value between 0.0f and 1.0f.
804739
*
@@ -857,7 +792,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
857792
*/
858793
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
859794
{
860-
return hmin(a).lane<0>();
795+
return _mm256_cvtss_f32(hmin(a).m);
861796
}
862797

863798
/**
@@ -887,7 +822,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
887822
*/
888823
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
889824
{
890-
return hmax(a).lane<0>();
825+
return _mm256_cvtss_f32(hmax(a).m);
891826
}
892827

893828
/**
@@ -1146,7 +1081,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
11461081
* @brief Return a vector of interleaved RGBA data.
11471082
*
11481083
* Input vectors have the value stored in the bottom 8 bits of each lane,
1149-
* with high bits set to zero.
1084+
* with high bits set to zero.
11501085
*
11511086
* Output vector stores a single RGBA texel packed in each lane.
11521087
*/

Source/astcenc_vecmathlib_common_4.h

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
129129
return a.lane<0>() + a.lane<1>() + a.lane<2>();
130130
}
131131

132+
/**
133+
* @brief Return the horizontal minimum of a vector.
134+
*/
135+
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
136+
{
137+
return hmin(a).lane<0>();
138+
}
139+
140+
/**
141+
* @brief Return the horizontal maximum of a vector.
142+
*/
143+
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
144+
{
145+
return hmax(a).lane<0>();
146+
}
147+
132148
// ============================================================================
133149
// vfloat4 operators and functions
134150
// ============================================================================
@@ -222,18 +238,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
222238
return min(max(a, minv), maxv);
223239
}
224240

225-
/**
226-
* @brief Return the clamped value between 0.0f and max.
227-
*
228-
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
229-
* be returned for that lane.
230-
*/
231-
ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
232-
{
233-
// Do not reorder - second operand will return if either is NaN
234-
return min(max(a, vfloat4::zero()), maxv);
235-
}
236-
237241
/**
238242
* @brief Return the clamped value between 0.0f and 1.0f.
239243
*

Source/astcenc_vecmathlib_neon_4.h

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -134,15 +134,6 @@ struct vfloat4
134134
return vfloat4(vld1q_f32(p));
135135
}
136136

137-
/**
138-
* @brief Factory that returns a vector containing the lane IDs.
139-
*/
140-
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
141-
{
142-
alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
143-
return vfloat4(vld1q_f32(data));
144-
}
145-
146137
/**
147138
* @brief Return a swizzled float 2.
148139
*/
@@ -611,21 +602,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
611602
vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
612603
}
613604

614-
/**
615-
* @brief Gather N (vector width) indices from the array.
616-
*/
617-
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
618-
{
619-
alignas(16) int idx[4];
620-
storea(indices, idx);
621-
alignas(16) int vals[4];
622-
vals[0] = base[idx[0]];
623-
vals[1] = base[idx[1]];
624-
vals[2] = base[idx[2]];
625-
vals[3] = base[idx[3]];
626-
return vint4(vals);
627-
}
628-
629605
/**
630606
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
631607
*/

Source/astcenc_vecmathlib_none_4.h

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -139,14 +139,6 @@ struct vfloat4
139139
return vfloat4(p);
140140
}
141141

142-
/**
143-
* @brief Factory that returns a vector containing the lane IDs.
144-
*/
145-
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
146-
{
147-
return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
148-
}
149-
150142
/**
151143
* @brief Return a swizzled float 2.
152144
*/
@@ -684,17 +676,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
684676
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
685677
}
686678

687-
/**
688-
* @brief Gather N (vector width) indices from the array.
689-
*/
690-
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
691-
{
692-
return vint4(base[indices.m[0]],
693-
base[indices.m[1]],
694-
base[indices.m[2]],
695-
base[indices.m[3]]);
696-
}
697-
698679
/**
699680
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
700681
*/

Source/astcenc_vecmathlib_sse_4.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -142,14 +142,6 @@ struct vfloat4
142142
return vfloat4(_mm_load_ps(p));
143143
}
144144

145-
/**
146-
* @brief Factory that returns a vector containing the lane IDs.
147-
*/
148-
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
149-
{
150-
return vfloat4(_mm_set_ps(3, 2, 1, 0));
151-
}
152-
153145
/**
154146
* @brief Return a swizzled float 2.
155147
*/
@@ -663,20 +655,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
663655
_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
664656
}
665657

666-
/**
667-
* @brief Gather N (vector width) indices from the array.
668-
*/
669-
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
670-
{
671-
#if ASTCENC_AVX >= 2
672-
return vint4(_mm_i32gather_epi32(base, indices.m, 4));
673-
#else
674-
alignas(16) int idx[4];
675-
storea(indices, idx);
676-
return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
677-
#endif
678-
}
679-
680658
/**
681659
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
682660
*/

Source/astcenc_weight_align.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ static void compute_lowest_and_highest_weight(
164164
promise(weight_count > 0);
165165
promise(max_angular_steps > 0);
166166

167-
vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f);
167+
vfloat rcp_stepsize = int_to_float(vint::lane_id()) + vfloat(1.0f);
168168

169169
// Arrays are ANGULAR_STEPS long, so always safe to run full vectors
170170
for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH)

0 commit comments

Comments
 (0)