Skip to content

Commit ef2c2a2

Browse files
committed
avutil/half2float: use native _Float16 if available
_Float16 support has been available on arm/aarch64 for a while, and with gcc 12 it was enabled on x86 as long as SSE2 is supported.

If the target arch supports f16c, gcc emits fairly efficient assembly, taking advantage of it. This is the case on x86-64-v3 or higher. The same goes for arm, which has native float16 support.
On x86 without f16c, it is emulated in software using SSE2 instructions. This has been shown to perform rather poorly:

_Float16 full SSE2 emulation:
frame=50074 fps=848 q=-0.0 size=N/A time=00:33:22.96 bitrate=N/A speed=33.9x

_Float16 f16c accelerated (Zen2, --cpu=znver2):
frame=50636 fps=1965 q=-0.0 Lsize=N/A time=00:33:45.40 bitrate=N/A speed=78.6x

classic half2float full software implementation:
frame=49926 fps=1605 q=-0.0 Lsize=N/A time=00:33:17.00 bitrate=N/A speed=64.2x

Hence an additional check was introduced that only enables use of _Float16 on x86 if f16c is being utilized.

On aarch64, a similar uplift in performance is seen:

RPi4 half2float full software implementation:
frame= 6088 fps=126 q=-0.0 Lsize=N/A time=00:04:03.48 bitrate=N/A speed=5.06x

RPi4 _Float16:
frame= 6103 fps=158 q=-0.0 Lsize=N/A time=00:04:04.08 bitrate=N/A speed=6.32x

Since arm/aarch64 always natively support 16-bit floats, it can always be considered fast there.

I'm not aware of any additional platforms that currently support _Float16. And if there are, they should be considered non-fast until proven fast.
1 parent 6dc79f1 commit ef2c2a2

File tree

5 files changed

+50
-0
lines changed

5 files changed

+50
-0
lines changed

configure

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2145,6 +2145,7 @@ ARCH_FEATURES="
21452145
fast_64bit
21462146
fast_clz
21472147
fast_cmov
2148+
fast_float16
21482149
local_aligned
21492150
simd_align_16
21502151
simd_align_32
@@ -5127,6 +5128,8 @@ elif enabled arm; then
51275128
;;
51285129
esac
51295130

5131+
test_cflags -mfp16-format=ieee && add_cflags -mfp16-format=ieee
5132+
51305133
elif enabled avr32; then
51315134

51325135
case $cpu in
@@ -6231,6 +6234,15 @@ check_builtin sync_val_compare_and_swap "" "int *ptr; int oldval, newval; __sync
62316234
check_builtin gmtime_r time.h "time_t *time; struct tm *tm; gmtime_r(time, tm)"
62326235
check_builtin localtime_r time.h "time_t *time; struct tm *tm; localtime_r(time, tm)"
62336236

6237+
# Probe for the _Float16 type, then decide whether using it counts as "fast".
# On x86 it is only treated as fast when the compiler targets F16C (__F16C__
# defined); arm and aarch64 are always treated as fast. Any other architecture
# leaves fast_float16 disabled.
check_builtin float16 "" "_Float16 f16var"
6238+
if enabled float16; then
6239+
if enabled x86; then
6240+
test_cpp_condition stddef.h "defined(__F16C__)" && enable fast_float16
6241+
elif enabled arm || enabled aarch64; then
6242+
enable fast_float16
6243+
fi
6244+
fi
6245+
62346246
case "$custom_allocator" in
62356247
jemalloc)
62366248
# jemalloc by default does not use a prefix

libavutil/float2half.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
void ff_init_float2half_tables(Float2HalfTables *t)
2222
{
23+
#if !HAVE_FAST_FLOAT16
2324
for (int i = 0; i < 256; i++) {
2425
int e = i - 127;
2526

@@ -50,4 +51,5 @@ void ff_init_float2half_tables(Float2HalfTables *t)
5051
t->shifttable[i|0x100] = 13;
5152
}
5253
}
54+
#endif
5355
}

libavutil/float2half.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,37 @@
2020
#define AVUTIL_FLOAT2HALF_H
2121

2222
#include <stdint.h>
23+
#include "intfloat.h"
24+
25+
#include "config.h"
2326

2427
/**
 * State passed to float2half().
 *
 * With HAVE_FAST_FLOAT16 the conversion needs no tables, so the struct only
 * carries a one-byte placeholder member; otherwise it holds the two
 * 512-entry lookup tables that float2half() indexes.
 */
typedef struct Float2HalfTables {
28+
#if HAVE_FAST_FLOAT16
29+
uint8_t dummy;
30+
#else
2531
uint16_t basetable[512];
2632
uint8_t shifttable[512];
33+
#endif
2734
} Float2HalfTables;
2835

2936
void ff_init_float2half_tables(Float2HalfTables *t);
3037

3138
/**
 * Convert a 32-bit float, given as its raw bit pattern, to a 16-bit half
 * float bit pattern.
 *
 * @param f 32-bit input value (bit pattern of the float to convert)
 * @param t conversion tables; only dereferenced when HAVE_FAST_FLOAT16 is 0
 * @return the 16-bit result
 */
static inline uint16_t float2half(uint32_t f, const Float2HalfTables *t)
3239
{
40+
#if HAVE_FAST_FLOAT16
41+
union {
42+
_Float16 f;
43+
uint16_t i;
44+
} u;
45+
/* Assigning to the _Float16 member lets the compiler do the conversion;
 * av_int2float() presumably reinterprets the bits as a float (see intfloat.h). */
u.f = av_int2float(f);
46+
/* Read the _Float16 storage back as an integer through the union. */
return u.i;
47+
#else
3348
uint16_t h;
3449

3550
/* Bits 23..31 of f select the table entries; the low 23 bits are shifted
 * down by the per-entry amount and added to the base value. */
h = t->basetable[(f >> 23) & 0x1ff] + ((f & 0x007fffff) >> t->shifttable[(f >> 23) & 0x1ff]);
3651

3752
return h;
53+
#endif
3854
}
3955

4056
#endif /* AVUTIL_FLOAT2HALF_H */

libavutil/half2float.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include "libavutil/half2float.h"
2020

21+
#if !HAVE_FAST_FLOAT16
2122
static uint32_t convertmantissa(uint32_t i)
2223
{
2324
int32_t m = i << 13; // Zero pad mantissa bits
@@ -33,9 +34,11 @@ static uint32_t convertmantissa(uint32_t i)
3334

3435
return m | e; // Return combined number
3536
}
37+
#endif
3638

3739
void ff_init_half2float_tables(Half2FloatTables *t)
3840
{
41+
#if !HAVE_FAST_FLOAT16
3942
t->mantissatable[0] = 0;
4043
for (int i = 1; i < 1024; i++)
4144
t->mantissatable[i] = convertmantissa(i);
@@ -60,4 +63,5 @@ void ff_init_half2float_tables(Half2FloatTables *t)
6063
t->offsettable[31] = 2048;
6164
t->offsettable[32] = 0;
6265
t->offsettable[63] = 2048;
66+
#endif
6367
}

libavutil/half2float.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,38 @@
2020
#define AVUTIL_HALF2FLOAT_H
2121

2222
#include <stdint.h>
23+
#include "intfloat.h"
24+
25+
#include "config.h"
2326

2427
/**
 * State passed to half2float().
 *
 * With HAVE_FAST_FLOAT16 the conversion needs no tables, so the struct only
 * carries a one-byte placeholder member; otherwise it holds the mantissa,
 * exponent and offset lookup tables that half2float() indexes.
 */
typedef struct Half2FloatTables {
28+
#if HAVE_FAST_FLOAT16
29+
uint8_t dummy;
30+
#else
2531
uint32_t mantissatable[3072];
2632
uint32_t exponenttable[64];
2733
uint16_t offsettable[64];
34+
#endif
2835
} Half2FloatTables;
2936

3037
void ff_init_half2float_tables(Half2FloatTables *t);
3138

3239
/**
 * Convert a 16-bit half float bit pattern to a 32-bit float, returned as its
 * raw bit pattern.
 *
 * @param h 16-bit input value (bit pattern of the half float to convert)
 * @param t conversion tables; only dereferenced when HAVE_FAST_FLOAT16 is 0
 * @return the 32-bit result
 */
static inline uint32_t half2float(uint16_t h, const Half2FloatTables *t)
3340
{
41+
#if HAVE_FAST_FLOAT16
42+
union {
43+
_Float16 f;
44+
uint16_t i;
45+
} u;
46+
/* Store the raw bits, then read them back as a _Float16 through the union. */
u.i = h;
47+
/* Passing the _Float16 lets the compiler widen it; av_float2int()
 * presumably returns the float's bit pattern (see intfloat.h). */
return av_float2int(u.f);
48+
#else
3449
uint32_t f;
3550

3651
/* The top 6 bits of h pick the offset and exponent entries; the low
 * 10 bits index into the mantissa table from that offset. */
f = t->mantissatable[t->offsettable[h >> 10] + (h & 0x3ff)] + t->exponenttable[h >> 10];
3752

3853
return f;
54+
#endif
3955
}
4056

4157
#endif /* AVUTIL_HALF2FLOAT_H */

0 commit comments

Comments
 (0)