Skip to content

Commit 58bcdeb

Browse files
committed
swscale/aarch64/range_convert: saturate output instead of limiting input
aarch64 A55:
chrRangeFromJpeg8_1920_c:    28836.2 (1.00x)
chrRangeFromJpeg8_1920_neon:  5312.6 (5.43x)    5313.9 (5.43x)
chrRangeToJpeg8_1920_c:      44196.2 (1.00x)
chrRangeToJpeg8_1920_neon:    6034.6 (7.32x)    5551.3 (7.96x)
lumRangeFromJpeg8_1920_c:    15388.5 (1.00x)
lumRangeFromJpeg8_1920_neon:  3150.7 (4.88x)    3152.3 (4.88x)
lumRangeToJpeg8_1920_c:      23069.7 (1.00x)
lumRangeToJpeg8_1920_neon:    3873.2 (5.96x)    3628.7 (6.36x)

aarch64 A76:
chrRangeFromJpeg8_1920_c:     6334.7 (1.00x)
chrRangeFromJpeg8_1920_neon:  2264.5 (2.80x)    2344.5 (2.70x)
chrRangeToJpeg8_1920_c:      11474.5 (1.00x)
chrRangeToJpeg8_1920_neon:    2646.5 (4.34x)    2824.2 (4.06x)
lumRangeFromJpeg8_1920_c:     4453.2 (1.00x)
lumRangeFromJpeg8_1920_neon:  1104.8 (4.03x)    1104.5 (4.03x)
lumRangeToJpeg8_1920_c:       6645.0 (1.00x)
lumRangeToJpeg8_1920_neon:    1310.5 (5.07x)    1329.8 (5.00x)
1 parent 2d1358a commit 58bcdeb

File tree

2 files changed

+18
-26
lines changed

2 files changed

+18
-26
lines changed

libswscale/aarch64/range_convert_neon.S

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,43 +20,37 @@
2020

2121
#include "libavutil/aarch64/asm.S"
2222

23-
.macro lumConvertRange name, max, mult, offset, shift
23+
.macro lumConvertRange name, fromto, mult, offset, shift
2424
function ff_\name, export=1
25-
.if \max != 0
26-
mov w3, #\max
27-
dup v24.8h, w3
28-
.endif
2925
mov w3, #\mult
3026
dup v25.4s, w3
3127
movz w3, #(\offset & 0xffff)
3228
movk w3, #((\offset >> 16) & 0xffff), lsl #16
3329
dup v26.4s, w3
3430
1:
3531
ld1 {v0.8h}, [x0]
36-
.if \max != 0
37-
smin v0.8h, v0.8h, v24.8h
38-
.endif
3932
mov v16.16b, v26.16b
4033
mov v18.16b, v26.16b
4134
sxtl v20.4s, v0.4h
4235
sxtl2 v22.4s, v0.8h
4336
mla v16.4s, v20.4s, v25.4s
4437
mla v18.4s, v22.4s, v25.4s
38+
.ifc \fromto, To
39+
sqshrn v0.4h, v16.4s, #\shift
40+
sqshrn2 v0.8h, v18.4s, #\shift
41+
.else
4542
shrn v0.4h, v16.4s, #\shift
4643
shrn2 v0.8h, v18.4s, #\shift
44+
.endif
4745
subs w1, w1, #8
4846
st1 {v0.8h}, [x0], #16
4947
b.gt 1b
5048
ret
5149
endfunc
5250
.endm
5351

54-
.macro chrConvertRange name, max, mult, offset, shift
52+
.macro chrConvertRange name, fromto, mult, offset, shift
5553
function ff_\name, export=1
56-
.if \max != 0
57-
mov w3, #\max
58-
dup v24.8h, w3
59-
.endif
6054
mov w3, #\mult
6155
dup v25.4s, w3
6256
movz w3, #(\offset & 0xffff)
@@ -65,10 +59,6 @@ function ff_\name, export=1
6559
1:
6660
ld1 {v0.8h}, [x0]
6761
ld1 {v1.8h}, [x1]
68-
.if \max != 0
69-
smin v0.8h, v0.8h, v24.8h
70-
smin v1.8h, v1.8h, v24.8h
71-
.endif
7262
mov v16.16b, v26.16b
7363
mov v17.16b, v26.16b
7464
mov v18.16b, v26.16b
@@ -81,10 +71,17 @@ function ff_\name, export=1
8171
mla v17.4s, v21.4s, v25.4s
8272
mla v18.4s, v22.4s, v25.4s
8373
mla v19.4s, v23.4s, v25.4s
74+
.ifc \fromto, To
75+
sqshrn v0.4h, v16.4s, #\shift
76+
sqshrn v1.4h, v17.4s, #\shift
77+
sqshrn2 v0.8h, v18.4s, #\shift
78+
sqshrn2 v1.8h, v19.4s, #\shift
79+
.else
8480
shrn v0.4h, v16.4s, #\shift
8581
shrn v1.4h, v17.4s, #\shift
8682
shrn2 v0.8h, v18.4s, #\shift
8783
shrn2 v1.8h, v19.4s, #\shift
84+
.endif
8885
subs w2, w2, #8
8986
st1 {v0.8h}, [x0], #16
9087
st1 {v1.8h}, [x1], #16
@@ -93,7 +90,7 @@ function ff_\name, export=1
9390
endfunc
9491
.endm
9592

96-
lumConvertRange lumRangeToJpeg_neon, 30189, 19077, -39057361, 14
97-
chrConvertRange chrRangeToJpeg_neon, 30775, 4663, -9289992, 12
98-
lumConvertRange lumRangeFromJpeg_neon, 0, 14071, 33561947, 14
99-
chrConvertRange chrRangeFromJpeg_neon, 0, 1799, 4081085, 11
93+
lumConvertRange lumRangeToJpeg_neon, To, 19077, -39057361, 14
94+
chrConvertRange chrRangeToJpeg_neon, To, 4663, -9289992, 12
95+
lumConvertRange lumRangeFromJpeg_neon, From, 14071, 33561947, 14
96+
chrConvertRange chrRangeFromJpeg_neon, From, 1799, 4081085, 11

libswscale/aarch64/swscale.c

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -225,10 +225,6 @@ void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
225225

226226
av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
227227
{
228-
/* This code is currently disabled because of changes in the base
229-
* implementation of these functions. This code should be enabled
230-
* again once those changes are ported to this architecture. */
231-
#if 0
232228
int cpu_flags = av_get_cpu_flags();
233229

234230
if (have_neon(cpu_flags)) {
@@ -242,7 +238,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
242238
}
243239
}
244240
}
245-
#endif
246241
}
247242

248243
av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)

0 commit comments

Comments
 (0)