ARM-software
diff --git a/‎README.md‎
Lines changed: 11 additions & 9 deletions b/‎README.md‎
Lines changed: 11 additions & 9 deletions
diff --git a/‎Source/UnitTest/cmake_core.cmake‎
Lines changed: 13 additions & 7 deletions b/‎Source/UnitTest/cmake_core.cmake‎
Lines changed: 13 additions & 7 deletions
diff --git a/‎Source/UnitTest/test_simd.cpp‎
Lines changed: 47 additions & 0 deletions b/‎Source/UnitTest/test_simd.cpp‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎Source/astcenc_compress_symbolic.cpp‎
Lines changed: 6 additions & 11 deletions b/‎Source/astcenc_compress_symbolic.cpp‎
Lines changed: 6 additions & 11 deletions
diff --git a/‎Source/astcenc_compute_variance.cpp‎
Lines changed: 2 additions & 9 deletions b/‎Source/astcenc_compute_variance.cpp‎
Lines changed: 2 additions & 9 deletions
diff --git a/‎Source/astcenc_decompress_symbolic.cpp‎
Lines changed: 20 additions & 35 deletions b/‎Source/astcenc_decompress_symbolic.cpp‎
Lines changed: 20 additions & 35 deletions
diff --git a/‎Source/astcenc_entry.cpp‎
Lines changed: 7 additions & 0 deletions b/‎Source/astcenc_entry.cpp‎
Lines changed: 7 additions & 0 deletions
@@ -70,20 +70,22 @@ Binaries are provided for 64-bit builds on Windows, macOS, and Linux.
 ## astcenc 2.x binaries
 
 The current builds of the astcenc 2.x series are provided as multiple binaries,
-each tuned for a specific SIMD instruction set. We provide, in order of
-increasing performance:
+each tuned for a specific SIMD instruction set.
+
+For x86-64 we provide, in order of increasing performance:
 
 * `astcenc-sse2` - uses SSE2
 * `astcenc-sse4.1` - uses SSE4.1 and POPCNT
-* `astcenc-avx2` - uses SSE4.2, POPCNT, and AVX2
+* `astcenc-avx2` - uses AVX2, SSE4.2, POPCNT, and F16C
+
+For Apple silicon macOS devices we provide:
+
+* `astcenc-neon` - uses NEON
 
-The SSE2 builds will work on all x86-64 host machines, but it is the slowest of
-the three. The other two require extended CPU instruction set support which is
-not universally available.
+The x86-64 SSE2 build will work on all x86-64 machines, but it is the slowest
+of the three. The other two require extended CPU instruction set support which
+is not universally available, but each step gains ~15% more performance.
 
-It is worth noting that the three binaries do not produce identical output
-images; there are minor output differences caused by variations in
-floating-point rounding.
 
 ## Repository branches
 
 
@@ -19,7 +19,8 @@ add_executable(test-simd-${ISA_SIMD})
 
 target_sources(test-simd-${ISA_SIMD}
     PRIVATE
-        test_simd.cpp)
+        test_simd.cpp
+        ../astcenc_mathlib_softfloat.cpp)
 
 target_include_directories(test-simd-${ISA_SIMD}
     PRIVATE
@@ -48,7 +49,8 @@ if(${ISA_SIMD} MATCHES "none")
             ASTCENC_NEON=0
             ASTCENC_SSE=0
             ASTCENC_AVX=0
-            ASTCENC_POPCNT=0)
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
 
     if (${ARCH} MATCHES x64)
         target_compile_options(test-simd-${ISA_SIMD}
@@ -62,15 +64,17 @@ elseif(${ISA_SIMD} MATCHES "neon")
             ASTCENC_NEON=1
             ASTCENC_SSE=0
             ASTCENC_AVX=0
-            ASTCENC_POPCNT=0)
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
 
 elseif(${ISA_SIMD} MATCHES "sse2")
     target_compile_definitions(test-simd-${ISA_SIMD}
         PRIVATE
             ASTCENC_NEON=0
             ASTCENC_SSE=20
             ASTCENC_AVX=0
-            ASTCENC_POPCNT=0)
+            ASTCENC_POPCNT=0
+            ASTCENC_F16C=0)
 
     target_compile_options(test-simd-${ISA_SIMD}
         PRIVATE
@@ -82,7 +86,8 @@ elseif(${ISA_SIMD} MATCHES "sse4.1")
             ASTCENC_NEON=0
             ASTCENC_SSE=41
             ASTCENC_AVX=0
-            ASTCENC_POPCNT=1)
+            ASTCENC_POPCNT=1
+            ASTCENC_F16C=0)
 
     target_compile_options(test-simd-${ISA_SIMD}
         PRIVATE
@@ -94,11 +99,12 @@ elseif(${ISA_SIMD} MATCHES "avx2")
             ASTCENC_NEON=0
             ASTCENC_SSE=41
             ASTCENC_AVX=2
-            ASTCENC_POPCNT=1)
+            ASTCENC_POPCNT=1
+            ASTCENC_F16C=1)
 
     target_compile_options(test-simd-${ISA_SIMD}
         PRIVATE
-            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt>
+            $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-mfpmath=sse -mavx2 -mpopcnt -mf16c>
             $<$<CXX_COMPILER_ID:MSVC>:/arch:AVX2>)
 endif()
 
 
@@ -1014,6 +1014,53 @@ TEST(vfloat4, int_to_float)
 	EXPECT_EQ(r.lane<3>(), 4.0f);
 }
 
+/** @brief Test vfloat4 float to fp16 conversion (normal, overflow, and NaN lanes). */
+TEST(vfloat4, float_to_float16)
+{
+	// Single-precision literals, consistent with the other float tests
+	vfloat4 a(1.5f, 234.5f, 345345.0f, qnan);
+	vint4 r = float_to_float16(a);
+
+	// Normal numbers
+	EXPECT_EQ(r.lane<0>(), 0x3E00);
+	EXPECT_EQ(r.lane<1>(), 0x5B54);
+
+	// Large numbers convert to infinity
+	EXPECT_EQ(r.lane<2>(), 0x7C00);
+
+	// NaN must convert to any valid NaN encoding
+	EXPECT_EQ((r.lane<3>() >> 10) & 0x1F, 0x1F); // Exponent must be all 1s
+	EXPECT_NE(r.lane<3>() & (0x3FF), 0);         // Mantissa must be non-zero
+}
+
+/** @brief Test scalar float to fp16 conversion. */
+TEST(sfloat, float_to_float16)
+{
+	// Single-precision literal, consistent with the other float tests
+	int r = float_to_float16(234.5f);
+	EXPECT_EQ(r, 0x5B54);
+}
+
+/** @brief Test vfloat4 fp16 to float conversion (normal, infinity, and NaN lanes). */
+TEST(vfloat4, float16_to_float)
+{
+	// Lane inputs mirror the encodings checked by the float_to_float16 test,
+	// plus 0xFFFF, which the NaN check below expects to decode as a NaN
+	vint4 a(0x3E00, 0x5B54, 0x7C00, 0xFFFF);
+	vfloat4 r = float16_to_float(a);
+
+	// Normal numbers
+	EXPECT_EQ(r.lane<0>(), 1.5f);
+	EXPECT_EQ(r.lane<1>(), 234.5f);
+
+	// Infinities must be preserved
+	EXPECT_TRUE(std::isinf(r.lane<2>()));
+
+	// NaNs must be preserved
+	EXPECT_TRUE(std::isnan(r.lane<3>()));
+}
+
+/** @brief Test scalar fp16 to float conversion. */
+TEST(sfloat, float16_to_float)
+{
+	float r = float16_to_float(0x5B54);
+
+	// Compare against a single-precision literal to match the type of r
+	EXPECT_EQ(r, 234.5f);
+}
 
 // VINT4 tests - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
 
@@ -1308,27 +1308,22 @@ void compress_block(
 
 		// detected a constant-color block. Encode as FP16 if using HDR
 		scb.error_block = 0;
+		scb.partition_count = 0;
 
 		if ((decode_mode == ASTCENC_PRF_HDR) ||
 		    (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A))
 		{
 			scb.block_mode = -1;
-			scb.partition_count = 0;
-			vfloat4 orig_color = blk->origin_texel;
-			scb.constant_color[0] = float_to_sf16(orig_color.lane<0>(), SF_NEARESTEVEN);
-			scb.constant_color[1] = float_to_sf16(orig_color.lane<1>(), SF_NEARESTEVEN);
-			scb.constant_color[2] = float_to_sf16(orig_color.lane<2>(), SF_NEARESTEVEN);
-			scb.constant_color[3] = float_to_sf16(orig_color.lane<3>(), SF_NEARESTEVEN);
+			vint4 color_f16 = float_to_float16(blk->origin_texel);
+			store(color_f16, scb.constant_color);
 		}
 		else
 		{
 			// Encode as UNORM16 if NOT using HDR.
 			scb.block_mode = -2;
-			scb.partition_count = 0;
-
-			vfloat4 color_u16f = clamp(0.0f, 1.0f, blk->origin_texel) * 65535.0f;
-			vint4 color_u16i = float_to_int_rtn(color_u16f);
-			store(color_u16i, scb.constant_color);
+			vfloat4 color_f32 = clamp(0.0f, 1.0f, blk->origin_texel) * 65535.0f;
+			vint4 color_u16 = float_to_int_rtn(color_f32);
+			store(color_u16, scb.constant_color);
 		}
 
 		trace_add_data("exit", "quality hit");
 
@@ -245,15 +245,8 @@ static void compute_pixel_region_variance(
 					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
 					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
 
-					uint16_t r = data[swz.r];
-					uint16_t g = data[swz.g];
-					uint16_t b = data[swz.b];
-					uint16_t a = data[swz.a];
-
-					vfloat4 d = vfloat4(sf16_to_float(r),
-					                    sf16_to_float(g),
-					                    sf16_to_float(b),
-					                    sf16_to_float(a));
+					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					vfloat4 d = float16_to_float(di);
 
 					if (!are_powers_1)
 					{
 
@@ -83,6 +83,7 @@ void decompress_symbolic_block(
 	// if we detected an error-block, blow up immediately.
 	if (scb->error_block)
 	{
+		// TODO: Check this - isn't linear LDR magenta too? Same below ...
 		if (decode_mode == ASTCENC_PRF_LDR_SRGB)
 		{
 			for (int i = 0; i < bsd->texel_count; i++)
@@ -115,74 +116,58 @@ void decompress_symbolic_block(
 
 	if (scb->block_mode < 0)
 	{
-		float red = 0, green = 0, blue = 0, alpha = 0;
+		vfloat4 color;
 		int use_lns = 0;
 		int use_nan = 0;
 
 		if (scb->block_mode == -2)
 		{
-			int ired = scb->constant_color[0];
-			int igreen = scb->constant_color[1];
-			int iblue = scb->constant_color[2];
-			int ialpha = scb->constant_color[3];
+			vint4 colori(scb->constant_color);
 
 			// For sRGB decoding a real decoder would just use the top 8 bits
 			// for color conversion. We don't color convert, so linearly scale
 			// the top 8 bits into the full 16 bit dynamic range
 			if (decode_mode == ASTCENC_PRF_LDR_SRGB)
 			{
-				ired = (ired >> 8) * 257;
-				igreen = (igreen >> 8) * 257;
-				iblue = (iblue >> 8) * 257;
-				ialpha = (ialpha >> 8) * 257;
+				colori = lsr<8>(colori) * 257;
 			}
 
-			red = sf16_to_float(unorm16_to_sf16(ired));
-			green = sf16_to_float(unorm16_to_sf16(igreen));
-			blue = sf16_to_float(unorm16_to_sf16(iblue));
-			alpha = sf16_to_float(unorm16_to_sf16(ialpha));
-			use_lns = 0;
-			use_nan = 0;
+			vint4 colorf16(
+				unorm16_to_sf16(colori.lane<0>()),
+				unorm16_to_sf16(colori.lane<1>()),
+				unorm16_to_sf16(colori.lane<2>()),
+				unorm16_to_sf16(colori.lane<3>())
+			);
+
+			color = float16_to_float(colorf16);
 		}
 		else
 		{
 			switch (decode_mode)
 			{
 			case ASTCENC_PRF_LDR_SRGB:
-				red = 1.0f;
-				green = 0.0f;
-				blue = 1.0f;
-				alpha = 1.0f;
-				use_lns = 0;
-				use_nan = 0;
+				color = vfloat4(1.0f, 0.0f, 1.0f, 1.0f);
 				break;
 			case ASTCENC_PRF_LDR:
-				red = 0.0f;
-				green = 0.0f;
-				blue = 0.0f;
-				alpha = 0.0f;
-				use_lns = 0;
+				color = vfloat4(0.0f);
 				use_nan = 1;
 				break;
 			case ASTCENC_PRF_HDR_RGB_LDR_A:
 			case ASTCENC_PRF_HDR:
 				// constant-color block; unpack from FP16 to FP32.
-				red = sf16_to_float(scb->constant_color[0]);
-				green = sf16_to_float(scb->constant_color[1]);
-				blue = sf16_to_float(scb->constant_color[2]);
-				alpha = sf16_to_float(scb->constant_color[3]);
+				color = float16_to_float(vint4(scb->constant_color));
 				use_lns = 1;
-				use_nan = 0;
 				break;
 			}
 		}
 
+		// TODO: Skip this and add constant color transfer to img block?
 		for (int i = 0; i < bsd->texel_count; i++)
 		{
-			blk->data_r[i] = red;
-			blk->data_g[i] = green;
-			blk->data_b[i] = blue;
-			blk->data_a[i] = alpha;
+			blk->data_r[i] = color.lane<0>();
+			blk->data_g[i] = color.lane<1>();
+			blk->data_b[i] = color.lane<2>();
+			blk->data_a[i] = color.lane<3>();
 			blk->rgb_lns[i] = use_lns;
 			blk->alpha_lns[i] = use_lns;
 			blk->nan_texel[i] = use_nan;
 
@@ -64,6 +64,13 @@ static astcenc_error validate_cpu_isa()
 		}
 	#endif
 
+	#if ASTCENC_F16C >= 1
+		if (!cpu_supports_f16c())
+		{
+			return ASTCENC_ERR_BAD_CPU_ISA;
+		}
+	#endif
+
 	#if ASTCENC_AVX >= 2
 		if (!cpu_supports_avx2())
 		{
Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,13 @@ static astcenc_error validate_cpu_isa()`
`64`	`64`	`}`
`65`	`65`	`#endif`
`66`	`66`
	`67`	`+ #if ASTCENC_F16C >= 1`
	`68`	`+ if (!cpu_supports_f16c())`
	`69`	`+ {`
	`70`	`+ return ASTCENC_ERR_BAD_CPU_ISA;`
	`71`	`+ }`
	`72`	`+ #endif`
	`73`	`+`
`67`	`74`	`#if ASTCENC_AVX >= 2`
`68`	`75`	`if (!cpu_supports_avx2())`
`69`	`76`	`{`