Skip to content

Commit 384fe39

Browse files
committed
swscale/range_convert: fix mpeg ranges in yuv range conversion for non-8-bit pixel formats
There is an issue with the constants used in YUV to YUV range conversion, where the upper bound is not respected when converting to mpeg range. With this commit, the constants are calculated at runtime, depending on the bit depth. This approach also allows us to more easily understand how the constants are derived. For bit depths <= 14, the number of fixed point bits has been set to 14 for all conversions, to simplify the code. For bit depths > 14, the number of fixed point bits has been raised and set to 18, to allow for the conversion to be accurate enough for the mpeg range to be respected. The convert functions now take the conversion constants (coeff and offset) as function arguments. For bit depths <= 14, coeff is unsigned 16-bit and offset is 32-bit. For bit depths > 14, coeff is unsigned 32-bit and offset is 64-bit. x86_64: chrRangeFromJpeg8_1920_c: 2127.4 2125.0 (1.00x) chrRangeFromJpeg16_1920_c: 2325.2 2127.2 (1.09x) chrRangeToJpeg8_1920_c: 3166.9 3168.7 (1.00x) chrRangeToJpeg16_1920_c: 2152.4 3164.8 (0.68x) lumRangeFromJpeg8_1920_c: 1263.0 1302.5 (0.97x) lumRangeFromJpeg16_1920_c: 1080.5 1299.2 (0.83x) lumRangeToJpeg8_1920_c: 1886.8 2112.2 (0.89x) lumRangeToJpeg16_1920_c: 1077.0 1906.5 (0.56x) aarch64 A55: chrRangeFromJpeg8_1920_c: 28835.2 28835.6 (1.00x) chrRangeFromJpeg16_1920_c: 28839.8 32680.8 (0.88x) chrRangeToJpeg8_1920_c: 23074.7 23075.4 (1.00x) chrRangeToJpeg16_1920_c: 17318.9 24996.0 (0.69x) lumRangeFromJpeg8_1920_c: 15389.7 15384.5 (1.00x) lumRangeFromJpeg16_1920_c: 15388.2 17306.7 (0.89x) lumRangeToJpeg8_1920_c: 19227.8 19226.6 (1.00x) lumRangeToJpeg16_1920_c: 15387.0 21146.3 (0.73x) aarch64 A76: chrRangeFromJpeg8_1920_c: 6324.4 6268.1 (1.01x) chrRangeFromJpeg16_1920_c: 6339.9 11521.5 (0.55x) chrRangeToJpeg8_1920_c: 9656.0 9612.8 (1.00x) chrRangeToJpeg16_1920_c: 6340.4 11651.8 (0.54x) lumRangeFromJpeg8_1920_c: 4422.0 4420.8 (1.00x) lumRangeFromJpeg16_1920_c: 4420.9 5762.0 (0.77x) lumRangeToJpeg8_1920_c: 5949.1 5977.5 (1.00x) 
lumRangeToJpeg16_1920_c: 4446.8 5946.2 (0.75x) NOTE: all SIMD optimizations for range_convert have been disabled. They will be re-enabled when they are fixed for each architecture. NOTE2: the same issue still exists in rgb2yuv conversions, which is not addressed in this commit.
1 parent 58bcdeb commit 384fe39

File tree

184 files changed

+880
-725
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

184 files changed

+880
-725
lines changed

libswscale/aarch64/swscale.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,10 @@ void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
225225

226226
av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
227227
{
228+
/* This code is currently disabled because of changes in the base
229+
* implementation of these functions. This code should be enabled
230+
* again once those changes are ported to this architecture. */
231+
#if 0
228232
int cpu_flags = av_get_cpu_flags();
229233

230234
if (have_neon(cpu_flags)) {
@@ -238,6 +242,7 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
238242
}
239243
}
240244
}
245+
#endif
241246
}
242247

243248
av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)

libswscale/hscale.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ static int lum_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, in
5959
}
6060

6161
if (c->lumConvertRange)
62-
c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
62+
c->lumConvertRange((int16_t*)dst[dst_pos], dstW,
63+
c->lumConvertRange_coeff, c->lumConvertRange_offset);
6364

6465
desc->dst->plane[0].sliceH += 1;
6566

@@ -192,7 +193,8 @@ static int chr_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, in
192193
}
193194

194195
if (c->chrConvertRange)
195-
c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
196+
c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW,
197+
c->chrConvertRange_coeff, c->chrConvertRange_offset);
196198

197199
desc->dst->plane[1].sliceH += 1;
198200
desc->dst->plane[2].sliceH += 1;

libswscale/swscale.c

Lines changed: 93 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -156,82 +156,98 @@ static void hScale8To19_c(SwsInternal *c, int16_t *_dst, int dstW,
156156

157157
// FIXME all pal and rgb srcFormats could do this conversion as well
158158
// FIXME all scalers more complex than bilinear could do half of this transform
159-
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
159+
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width,
160+
uint32_t _coeff, int64_t _offset)
160161
{
162+
uint16_t coeff = _coeff;
163+
int32_t offset = _offset;
161164
int i;
162165
for (i = 0; i < width; i++) {
163-
int U = (dstU[i] * 4663 - 9289992) >> 12; // -264
164-
int V = (dstV[i] * 4663 - 9289992) >> 12; // -264
166+
int U = (dstU[i] * coeff + offset) >> 14;
167+
int V = (dstV[i] * coeff + offset) >> 14;
165168
dstU[i] = FFMIN(U, (1 << 15) - 1);
166169
dstV[i] = FFMIN(V, (1 << 15) - 1);
167170
}
168171
}
169172

170-
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
173+
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width,
174+
uint32_t _coeff, int64_t _offset)
171175
{
176+
uint16_t coeff = _coeff;
177+
int32_t offset = _offset;
172178
int i;
173179
for (i = 0; i < width; i++) {
174-
dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
175-
dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
180+
dstU[i] = (dstU[i] * coeff + offset) >> 14;
181+
dstV[i] = (dstV[i] * coeff + offset) >> 14;
176182
}
177183
}
178184

179-
static void lumRangeToJpeg_c(int16_t *dst, int width)
185+
static void lumRangeToJpeg_c(int16_t *dst, int width,
186+
uint32_t _coeff, int64_t _offset)
180187
{
188+
uint16_t coeff = _coeff;
189+
int32_t offset = _offset;
181190
int i;
182191
for (i = 0; i < width; i++) {
183-
int Y = (dst[i] * 19077 - 39057361) >> 14;
192+
int Y = (dst[i] * coeff + offset) >> 14;
184193
dst[i] = FFMIN(Y, (1 << 15) - 1);
185194
}
186195
}
187196

188-
static void lumRangeFromJpeg_c(int16_t *dst, int width)
197+
static void lumRangeFromJpeg_c(int16_t *dst, int width,
198+
uint32_t _coeff, int64_t _offset)
189199
{
200+
uint16_t coeff = _coeff;
201+
int32_t offset = _offset;
190202
int i;
191203
for (i = 0; i < width; i++)
192-
dst[i] = (dst[i] * 14071 + 33561947) >> 14;
204+
dst[i] = (dst[i] * coeff + offset) >> 14;
193205
}
194206

195-
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
207+
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width,
208+
uint32_t coeff, int64_t offset)
196209
{
197210
int i;
198211
int32_t *dstU = (int32_t *) _dstU;
199212
int32_t *dstV = (int32_t *) _dstV;
200213
for (i = 0; i < width; i++) {
201-
int U = ((int)(dstU[i] * 4663U - (9289992 << 4))) >> 12; // -264
202-
int V = ((int)(dstV[i] * 4663U - (9289992 << 4))) >> 12; // -264
214+
int U = ((int64_t) dstU[i] * coeff + offset) >> 18;
215+
int V = ((int64_t) dstV[i] * coeff + offset) >> 18;
203216
dstU[i] = FFMIN(U, (1 << 19) - 1);
204217
dstV[i] = FFMIN(V, (1 << 19) - 1);
205218
}
206219
}
207220

208-
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
221+
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width,
222+
uint32_t coeff, int64_t offset)
209223
{
210224
int i;
211225
int32_t *dstU = (int32_t *) _dstU;
212226
int32_t *dstV = (int32_t *) _dstV;
213227
for (i = 0; i < width; i++) {
214-
dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
215-
dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
228+
dstU[i] = ((int64_t) dstU[i] * coeff + offset) >> 18;
229+
dstV[i] = ((int64_t) dstV[i] * coeff + offset) >> 18;
216230
}
217231
}
218232

219-
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
233+
static void lumRangeToJpeg16_c(int16_t *_dst, int width,
234+
uint32_t coeff, int64_t offset)
220235
{
221236
int i;
222237
int32_t *dst = (int32_t *) _dst;
223238
for (i = 0; i < width; i++) {
224-
int Y = ((int)(dst[i] * 4769U - (39057361 << 2))) >> 12;
239+
int Y = ((int64_t) dst[i] * coeff + offset) >> 18;
225240
dst[i] = FFMIN(Y, (1 << 19) - 1);
226241
}
227242
}
228243

229-
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
244+
static void lumRangeFromJpeg16_c(int16_t *_dst, int width,
245+
uint32_t coeff, int64_t offset)
230246
{
231247
int i;
232248
int32_t *dst = (int32_t *) _dst;
233249
for (i = 0; i < width; i++)
234-
dst[i] = ((int)(dst[i]*(14071U/4) + (33561947<<4)/4)) >> 12;
250+
dst[i] = ((int64_t) dst[i] * coeff + offset) >> 18;
235251
}
236252

237253

@@ -547,11 +563,68 @@ int ff_swscale(SwsInternal *c, const uint8_t *const src[], const int srcStride[]
547563
return dstY - lastDstY;
548564
}
549565

566+
/*
567+
* Solve for coeff and offset:
568+
* dst = ((src << src_shift) * coeff + offset) >> (mult_shift + src_shift)
569+
*
570+
* If SwsInternal->dstBpc is > 14, coeff is uint16_t and offset is int32_t,
571+
* otherwise (SwsInternal->dstBpc is <= 14) coeff is uint32_t and offset is
572+
* int64_t.
573+
*/
574+
static void solve_range_convert(uint16_t src_min, uint16_t src_max,
575+
uint16_t dst_min, uint16_t dst_max,
576+
int src_bits, int src_shift, int mult_shift,
577+
uint32_t *coeff, int64_t *offset)
578+
{
579+
uint16_t src_range = src_max - src_min;
580+
uint16_t dst_range = dst_max - dst_min;
581+
int total_shift = mult_shift + src_shift;
582+
*coeff = AV_CEIL_RSHIFT(((uint64_t) dst_range << total_shift) / src_range, src_shift);
583+
*offset = ((int64_t) dst_max << total_shift) -
584+
((int64_t) src_max << src_shift) * *coeff;
585+
}
586+
587+
static void init_range_convert_constants(SwsInternal *c)
588+
{
589+
const int bit_depth = c->dstBpc ? c->dstBpc : 8;
590+
const int src_bits = bit_depth <= 14 ? 15 : 19;
591+
const int src_shift = src_bits - bit_depth;
592+
const int mult_shift = bit_depth <= 14 ? 14 : 18;
593+
const uint16_t mpeg_min = 16U << (bit_depth - 8);
594+
const uint16_t mpeg_max_lum = 235U << (bit_depth - 8);
595+
const uint16_t mpeg_max_chr = 240U << (bit_depth - 8);
596+
const uint16_t jpeg_max = (1U << bit_depth) - 1;
597+
uint16_t src_min, src_max_lum, src_max_chr;
598+
uint16_t dst_min, dst_max_lum, dst_max_chr;
599+
if (c->opts.src_range) {
600+
src_min = 0;
601+
src_max_lum = jpeg_max;
602+
src_max_chr = jpeg_max;
603+
dst_min = mpeg_min;
604+
dst_max_lum = mpeg_max_lum;
605+
dst_max_chr = mpeg_max_chr;
606+
} else {
607+
src_min = mpeg_min;
608+
src_max_lum = mpeg_max_lum;
609+
src_max_chr = mpeg_max_chr;
610+
dst_min = 0;
611+
dst_max_lum = jpeg_max;
612+
dst_max_chr = jpeg_max;
613+
}
614+
solve_range_convert(src_min, src_max_lum, dst_min, dst_max_lum,
615+
src_bits, src_shift, mult_shift,
616+
&c->lumConvertRange_coeff, &c->lumConvertRange_offset);
617+
solve_range_convert(src_min, src_max_chr, dst_min, dst_max_chr,
618+
src_bits, src_shift, mult_shift,
619+
&c->chrConvertRange_coeff, &c->chrConvertRange_offset);
620+
}
621+
550622
av_cold void ff_sws_init_range_convert(SwsInternal *c)
551623
{
552624
c->lumConvertRange = NULL;
553625
c->chrConvertRange = NULL;
554626
if (c->opts.src_range != c->opts.dst_range && !isAnyRGB(c->opts.dst_format)) {
627+
init_range_convert_constants(c);
555628
if (c->dstBpc <= 14) {
556629
if (c->opts.src_range) {
557630
c->lumConvertRange = lumRangeFromJpeg_c;

libswscale/swscale_internal.h

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -647,10 +647,28 @@ struct SwsInternal {
647647
const int32_t *filterPos, int filterSize);
648648
/** @} */
649649

650-
/// Color range conversion function for luma plane if needed.
651-
void (*lumConvertRange)(int16_t *dst, int width);
652-
/// Color range conversion function for chroma planes if needed.
653-
void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
650+
/**
651+
* Color range conversion functions if needed.
652+
* If SwsInternal->dstBpc is > 14:
653+
* - int16_t *dst (data is 15 bpc)
654+
* - uint16_t coeff
655+
* - int32_t offset
656+
* Otherwise (SwsInternal->dstBpc is <= 14):
657+
* - int32_t *dst (data is 19 bpc)
658+
* - uint32_t coeff
659+
* - int64_t offset
660+
*/
661+
/** @{ */
662+
void (*lumConvertRange)(int16_t *dst, int width,
663+
uint32_t coeff, int64_t offset);
664+
void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width,
665+
uint32_t coeff, int64_t offset);
666+
/** @} */
667+
668+
uint32_t lumConvertRange_coeff;
669+
uint32_t chrConvertRange_coeff;
670+
int64_t lumConvertRange_offset;
671+
int64_t chrConvertRange_offset;
654672

655673
int needs_hcscale; ///< Set if there are chroma planes to be converted.
656674

libswscale/x86/swscale.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,12 +474,17 @@ RANGE_CONVERT_FUNCS_DECL(avx2);
474474

475475
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
476476
{
477+
/* This code is currently disabled because of changes in the base
478+
* implementation of these functions. This code should be enabled
479+
* again once those changes are ported to this architecture. */
480+
#if 0
477481
int cpu_flags = av_get_cpu_flags();
478482
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
479483
RANGE_CONVERT_FUNCS(avx2);
480484
} else if (EXTERNAL_SSE2(cpu_flags)) {
481485
RANGE_CONVERT_FUNCS(sse2);
482486
}
487+
#endif
483488
}
484489

485490
av_cold void ff_sws_init_swscale_x86(SwsInternal *c)

0 commit comments

Comments
 (0)