Skip to content

Commit d1a7bda

Browse files
authored
Implement vec4 support for float16 conversions (#214)
This PR implements the ability to convert floats stored in vfloat4 vectors to and from fp16 bit patterns stored in vint4 vectors. Scalar versions are also provided as overloaded functions of the same name, for the sake of convenience. The code has been refactored to take advantage of the vectorization opportunities this gives, in particular using NEON (Arm) and F16C (x86-64) ISA support for float<>fp16 conversion. The F16C support is tied to the AVX2 enable config option; it is not a separate enable. In cases where the soft-float code is not needed, code size reduces by ~5KB.
1 parent 05a435d commit d1a7bda

20 files changed

Lines changed: 501 additions & 246 deletions

README.md

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -70,20 +70,22 @@ Binaries are provided for 64-bit builds on Windows, macOS, and Linux.
7070
## astcenc 2.x binaries
7171

7272
The current builds of the astcenc 2.x series are provided as multiple binaries,
73-
each tuned for a specific SIMD instruction set. We provide, in order of
74-
increasing performance:
73+
each tuned for a specific SIMD instruction set.
74+
75+
For x86-64 we provide, in order of increasing performance:
7576

7677
* `astcenc-sse2` - uses SSE2
7778
* `astcenc-sse4.1` - uses SSE4.1 and POPCNT
78-
* `astcenc-avx2` - uses SSE4.2, POPCNT, and AVX2
79+
* `astcenc-avx2` - uses AVX2, SSE4.2, POPCNT, and F16C
80+
81+
For Apple silicon macOS devices we provide:
82+
83+
* `astcenc-neon` - uses NEON
7984

80-
The SSE2 builds will work on all x86-64 host machines, but it is the slowest of
81-
the three. The other two require extended CPU instruction set support which is
82-
not universally available.
85+
The x86-64 SSE2 build will work on all x86-64 machines, but it is the slowest
86+
of the three. The other two require extended CPU instruction set support which
87+
is not universally available, but each step gains ~15% more performance.
8388

84-
It is worth noting that the three binaries do not produce identical output
85-
images; there are minor output differences caused by variations in
86-
floating-point rounding.
8789

8890
## Repository branches
8991

Source/UnitTest/cmake_core.cmake

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ add_executable(test-simd-${ISA_SIMD})
1919

2020
target_sources(test-simd-${ISA_SIMD}
2121
PRIVATE
22-
test_simd.cpp)
22+
test_simd.cpp
23+
../astcenc_mathlib_softfloat.cpp)
2324

2425
target_include_directories(test-simd-${ISA_SIMD}
2526
PRIVATE
@@ -48,7 +49,8 @@ if(${ISA_SIMD} MATCHES "none")
4849
ASTCENC_NEON=0
4950
ASTCENC_SSE=0
5051
ASTCENC_AVX=0
51-
ASTCENC_POPCNT=0)
52+
ASTCENC_POPCNT=0
53+
ASTCENC_F16C=0)
5254

5355
if (${ARCH} MATCHES x64)
5456
target_compile_options(test-simd-${ISA_SIMD}
@@ -62,15 +64,17 @@ elseif(${ISA_SIMD} MATCHES "neon")
6264
ASTCENC_NEON=1
6365
ASTCENC_SSE=0
6466
ASTCENC_AVX=0
65-
ASTCENC_POPCNT=0)
67+
ASTCENC_POPCNT=0
68+
ASTCENC_F16C=0)
6669

6770
elseif(${ISA_SIMD} MATCHES "sse2")
6871
target_compile_definitions(test-simd-${ISA_SIMD}
6972
PRIVATE
7073
ASTCENC_NEON=0
7174
ASTCENC_SSE=20
7275
ASTCENC_AVX=0
73-
ASTCENC_POPCNT=0)
76+
ASTCENC_POPCNT=0
77+
ASTCENC_F16C=0)
7478

7579
target_compile_options(test-simd-${ISA_SIMD}
7680
PRIVATE
@@ -82,7 +86,8 @@ elseif(${ISA_SIMD} MATCHES "sse4.1")
8286
ASTCENC_NEON=0
8387
ASTCENC_SSE=41
8488
ASTCENC_AVX=0
85-
ASTCENC_POPCNT=1)
89+
ASTCENC_POPCNT=1
90+
ASTCENC_F16C=0)
8691

8792
target_compile_options(test-simd-${ISA_SIMD}
8893
PRIVATE
@@ -94,11 +99,12 @@ elseif(${ISA_SIMD} MATCHES "avx2")
9499
ASTCENC_NEON=0
95100
ASTCENC_SSE=41
96101
ASTCENC_AVX=2
97-
ASTCENC_POPCNT=1)
102+
ASTCENC_POPCNT=1
103+
ASTCENC_F16C=1)
98104

99105
target_compile_options(test-simd-${ISA_SIMD}
100106
PRIVATE
101-
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt>
107+
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt -mf16c>
102108
$<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
103109
endif()
104110

Source/UnitTest/test_simd.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,53 @@ TEST(vfloat4, int_to_float)
10141014
EXPECT_EQ(r.lane<3>(), 4.0f);
10151015
}
10161016

1017+
/** @brief Test vfloat4 float to fp16 conversion for normal, overflowing, and NaN inputs. */
1018+
TEST(vfloat4, float_to_float16)
1019+
{
1020+
vfloat4 a(1.5, 234.5, 345345.0, qnan);
1021+
vint4 r = float_to_float16(a);
1022+
1023+
// Normal numbers: each lane must hold the expected fp16 bit pattern
1024+
EXPECT_EQ(r.lane<0>(), 0x3E00);
1025+
EXPECT_EQ(r.lane<1>(), 0x5B54);
1026+
1027+
// Large numbers convert to infinity (0x7C00: exponent all 1s, mantissa zero)
1028+
EXPECT_EQ(r.lane<2>(), 0x7C00);
1029+
1030+
// NaN must convert to any valid NaN encoding, so only the fields are checked
1031+
EXPECT_EQ((r.lane<3>() >> 10) & 0x1F, 0x1F); // Exponent (bits 10-14) must be all 1s
1032+
EXPECT_NE(r.lane<3>() & (0x3FF), 0); // Mantissa (bits 0-9) must be non-zero
1033+
}
1034+
1035+
/** @brief Test the scalar overload of float to fp16 conversion. */
1036+
TEST(sfloat, float_to_float16)
1037+
{
1038+
int r = float_to_float16(234.5);
1039+
EXPECT_EQ(r, 0x5B54);
1040+
}
1041+
1042+
/** @brief Test vfloat4 fp16 to float conversion for normal, infinite, and NaN inputs. */
1043+
TEST(vfloat4, float16_to_float)
1044+
{ vint4 a(0x3E00, 0x5B54, 0x7C00, 0xFFFF);
1045+
vfloat4 r = float16_to_float(a);
1046+
1047+
// Normal numbers: each lane must decode to the expected value
1048+
EXPECT_EQ(r.lane<0>(), 1.5);
1049+
EXPECT_EQ(r.lane<1>(), 234.5);
1050+
1051+
// Infinities must be preserved (lane 2 input is 0x7C00)
1052+
EXPECT_NE(std::isinf(r.lane<2>()), 0);
1053+
1054+
// NaNs must be preserved (lane 3 input is 0xFFFF)
1055+
EXPECT_NE(std::isnan(r.lane<3>()), 0);
1056+
}
1057+
1058+
/** @brief Test the scalar overload of fp16 to float conversion. */
1059+
TEST(sfloat, float16_to_float)
1060+
{
1061+
float r = float16_to_float(0x5B54);
1062+
EXPECT_EQ(r, 234.5);
1063+
}
10171064

10181065
// VINT4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
10191066

Source/astcenc_compress_symbolic.cpp

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,27 +1308,22 @@ void compress_block(
13081308

13091309
// detected a constant-color block. Encode as FP16 if using HDR
13101310
scb.error_block = 0;
1311+
scb.partition_count = 0;
13111312

13121313
if ((decode_mode == ASTCENC_PRF_HDR) ||
13131314
(decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
13141315
{
13151316
scb.block_mode = -1;
1316-
scb.partition_count = 0;
1317-
vfloat4 orig_color = blk->origin_texel;
1318-
scb.constant_color[0] = float_to_sf16(orig_color.lane<0>(), SF_NEARESTEVEN);
1319-
scb.constant_color[1] = float_to_sf16(orig_color.lane<1>(), SF_NEARESTEVEN);
1320-
scb.constant_color[2] = float_to_sf16(orig_color.lane<2>(), SF_NEARESTEVEN);
1321-
scb.constant_color[3] = float_to_sf16(orig_color.lane<3>(), SF_NEARESTEVEN);
1317+
vint4 color_f16 = float_to_float16(blk->origin_texel);
1318+
store(color_f16, scb.constant_color);
13221319
}
13231320
else
13241321
{
13251322
// Encode as UNORM16 if NOT using HDR.
13261323
scb.block_mode = -2;
1327-
scb.partition_count = 0;
1328-
1329-
vfloat4 color_u16f = clamp(0.0f, 1.0f, blk->origin_texel) * 65535.0f;
1330-
vint4 color_u16i = float_to_int_rtn(color_u16f);
1331-
store(color_u16i, scb.constant_color);
1324+
vfloat4 color_f32 = clamp(0.0f, 1.0f, blk->origin_texel) * 65535.0f;
1325+
vint4 color_u16 = float_to_int_rtn(color_f32);
1326+
store(color_u16, scb.constant_color);
13321327
}
13331328

13341329
trace_add_data("exit", "quality hit");

Source/astcenc_compute_variance.cpp

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -245,15 +245,8 @@ static void compute_pixel_region_variance(
245245
data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
246246
data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
247247

248-
uint16_t r = data[swz.r];
249-
uint16_t g = data[swz.g];
250-
uint16_t b = data[swz.b];
251-
uint16_t a = data[swz.a];
252-
253-
vfloat4 d = vfloat4(sf16_to_float(r),
254-
sf16_to_float(g),
255-
sf16_to_float(b),
256-
sf16_to_float(a));
248+
vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
249+
vfloat4 d = float16_to_float(di);
257250

258251
if (!are_powers_1)
259252
{

Source/astcenc_decompress_symbolic.cpp

Lines changed: 20 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ void decompress_symbolic_block(
8383
// if we detected an error-block, blow up immediately.
8484
if (scb->error_block)
8585
{
86+
// TODO: Check this - isn't linear LDR magenta too? Same below ...
8687
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
8788
{
8889
for (int i = 0; i < bsd->texel_count; i++)
@@ -115,74 +116,58 @@ void decompress_symbolic_block(
115116

116117
if (scb->block_mode < 0)
117118
{
118-
float red = 0, green = 0, blue = 0, alpha = 0;
119+
vfloat4 color;
119120
int use_lns = 0;
120121
int use_nan = 0;
121122

122123
if (scb->block_mode == -2)
123124
{
124-
int ired = scb->constant_color[0];
125-
int igreen = scb->constant_color[1];
126-
int iblue = scb->constant_color[2];
127-
int ialpha = scb->constant_color[3];
125+
vint4 colori(scb->constant_color);
128126

129127
// For sRGB decoding a real decoder would just use the top 8 bits
130128
// for color conversion. We don't color convert, so linearly scale
131129
// the top 8 bits into the full 16 bit dynamic range
132130
if (decode_mode == ASTCENC_PRF_LDR_SRGB)
133131
{
134-
ired = (ired >> 8) * 257;
135-
igreen = (igreen >> 8) * 257;
136-
iblue = (iblue >> 8) * 257;
137-
ialpha = (ialpha >> 8) * 257;
132+
colori = lsr<8>(colori) * 257;
138133
}
139134

140-
red = sf16_to_float(unorm16_to_sf16(ired));
141-
green = sf16_to_float(unorm16_to_sf16(igreen));
142-
blue = sf16_to_float(unorm16_to_sf16(iblue));
143-
alpha = sf16_to_float(unorm16_to_sf16(ialpha));
144-
use_lns = 0;
145-
use_nan = 0;
135+
vint4 colorf16(
136+
unorm16_to_sf16(colori.lane<0>()),
137+
unorm16_to_sf16(colori.lane<1>()),
138+
unorm16_to_sf16(colori.lane<2>()),
139+
unorm16_to_sf16(colori.lane<3>())
140+
);
141+
142+
color = float16_to_float(colorf16);
146143
}
147144
else
148145
{
149146
switch (decode_mode)
150147
{
151148
case ASTCENC_PRF_LDR_SRGB:
152-
red = 1.0f;
153-
green = 0.0f;
154-
blue = 1.0f;
155-
alpha = 1.0f;
156-
use_lns = 0;
157-
use_nan = 0;
149+
color = vfloat4(1.0f, 0.0f, 1.0f, 1.0f);
158150
break;
159151
case ASTCENC_PRF_LDR:
160-
red = 0.0f;
161-
green = 0.0f;
162-
blue = 0.0f;
163-
alpha = 0.0f;
164-
use_lns = 0;
152+
color = vfloat4(0.0f);
165153
use_nan = 1;
166154
break;
167155
case ASTCENC_PRF_HDR_RGB_LDR_A:
168156
case ASTCENC_PRF_HDR:
169157
// constant-color block; unpack from FP16 to FP32.
170-
red = sf16_to_float(scb->constant_color[0]);
171-
green = sf16_to_float(scb->constant_color[1]);
172-
blue = sf16_to_float(scb->constant_color[2]);
173-
alpha = sf16_to_float(scb->constant_color[3]);
158+
color = float16_to_float(vint4(scb->constant_color));
174159
use_lns = 1;
175-
use_nan = 0;
176160
break;
177161
}
178162
}
179163

164+
// TODO: Skip this and add constant color transfer to img block?
180165
for (int i = 0; i < bsd->texel_count; i++)
181166
{
182-
blk->data_r[i] = red;
183-
blk->data_g[i] = green;
184-
blk->data_b[i] = blue;
185-
blk->data_a[i] = alpha;
167+
blk->data_r[i] = color.lane<0>();
168+
blk->data_g[i] = color.lane<1>();
169+
blk->data_b[i] = color.lane<2>();
170+
blk->data_a[i] = color.lane<3>();
186171
blk->rgb_lns[i] = use_lns;
187172
blk->alpha_lns[i] = use_lns;
188173
blk->nan_texel[i] = use_nan;

Source/astcenc_entry.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@ static astcenc_error validate_cpu_isa()
6464
}
6565
#endif
6666

67+
#if ASTCENC_F16C >= 1
68+
if (!cpu_supports_f16c())
69+
{
70+
return ASTCENC_ERR_BAD_CPU_ISA;
71+
}
72+
#endif
73+
6774
#if ASTCENC_AVX >= 2
6875
if (!cpu_supports_avx2())
6976
{

0 commit comments

Comments
 (0)