ossrs
diff --git a/‎libswscale/aarch64/swscale.c
Lines changed: 5 additions & 0 deletions b/‎libswscale/aarch64/swscale.c
Lines changed: 5 additions & 0 deletions
diff --git a/‎libswscale/hscale.c
Lines changed: 4 additions & 2 deletions b/‎libswscale/hscale.c
Lines changed: 4 additions & 2 deletions
diff --git a/‎libswscale/swscale.c
Lines changed: 93 additions & 20 deletions b/‎libswscale/swscale.c
Lines changed: 93 additions & 20 deletions
diff --git a/‎libswscale/swscale_internal.h
Lines changed: 22 additions & 4 deletions b/‎libswscale/swscale_internal.h
Lines changed: 22 additions & 4 deletions
diff --git a/‎libswscale/x86/swscale.c
Lines changed: 5 additions & 0 deletions b/‎libswscale/x86/swscale.c
Lines changed: 5 additions & 0 deletions
@@ -225,6 +225,10 @@ void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
 
 av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
 {
+    /* This code is currently disabled because of changes in the base
+     * implementation of these functions. This code should be enabled
+     * again once those changes are ported to this architecture. */
+#if 0
     int cpu_flags = av_get_cpu_flags();
 
     if (have_neon(cpu_flags)) {
@@ -238,6 +242,7 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
             }
         }
     }
+#endif
 }
 
 av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
 
@@ -59,7 +59,8 @@ static int lum_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, in
         }
 
         if (c->lumConvertRange)
-            c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
+            c->lumConvertRange((int16_t*)dst[dst_pos], dstW,
+                               c->lumConvertRange_coeff, c->lumConvertRange_offset);
 
         desc->dst->plane[0].sliceH += 1;
 
@@ -192,7 +193,8 @@ static int chr_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, in
         }
 
         if (c->chrConvertRange)
-            c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
+            c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW,
+                               c->chrConvertRange_coeff, c->chrConvertRange_offset);
 
         desc->dst->plane[1].sliceH += 1;
         desc->dst->plane[2].sliceH += 1;
 
@@ -156,82 +156,98 @@ static void hScale8To19_c(SwsInternal *c, int16_t *_dst, int dstW,
 
 // FIXME all pal and rgb srcFormats could do this conversion as well
 // FIXME all scalers more complex than bilinear could do half of this transform
-static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width,
+                             uint32_t _coeff, int64_t _offset)
 {
+    uint16_t coeff = _coeff;
+    int32_t offset = _offset;
     int i;
     for (i = 0; i < width; i++) {
-        int U = (dstU[i] * 4663 - 9289992) >> 12; // -264
-        int V = (dstV[i] * 4663 - 9289992) >> 12; // -264
+        int U = (dstU[i] * coeff + offset) >> 14;
+        int V = (dstV[i] * coeff + offset) >> 14;
         dstU[i] = FFMIN(U, (1 << 15) - 1);
         dstV[i] = FFMIN(V, (1 << 15) - 1);
     }
 }
 
-static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
+static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width,
+                               uint32_t _coeff, int64_t _offset)
 {
+    uint16_t coeff = _coeff;
+    int32_t offset = _offset;
     int i;
     for (i = 0; i < width; i++) {
-        dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
-        dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
+        dstU[i] = (dstU[i] * coeff + offset) >> 14;
+        dstV[i] = (dstV[i] * coeff + offset) >> 14;
     }
 }
 
-static void lumRangeToJpeg_c(int16_t *dst, int width)
+static void lumRangeToJpeg_c(int16_t *dst, int width,
+                             uint32_t _coeff, int64_t _offset)
 {
+    uint16_t coeff = _coeff;
+    int32_t offset = _offset;
     int i;
     for (i = 0; i < width; i++) {
-        int Y = (dst[i] * 19077 - 39057361) >> 14;
+        int Y = (dst[i] * coeff + offset) >> 14;
         dst[i] = FFMIN(Y, (1 << 15) - 1);
     }
 }
 
-static void lumRangeFromJpeg_c(int16_t *dst, int width)
+static void lumRangeFromJpeg_c(int16_t *dst, int width,
+                               uint32_t _coeff, int64_t _offset)
 {
+    uint16_t coeff = _coeff;
+    int32_t offset = _offset;
     int i;
     for (i = 0; i < width; i++)
-        dst[i] = (dst[i] * 14071 + 33561947) >> 14;
+        dst[i] = (dst[i] * coeff + offset) >> 14;
 }
 
-static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width,
+                               uint32_t coeff, int64_t offset)
 {
     int i;
     int32_t *dstU = (int32_t *) _dstU;
     int32_t *dstV = (int32_t *) _dstV;
     for (i = 0; i < width; i++) {
-        int U = ((int)(dstU[i] * 4663U - (9289992 << 4))) >> 12; // -264
-        int V = ((int)(dstV[i] * 4663U - (9289992 << 4))) >> 12; // -264
+        int U = ((int64_t) dstU[i] * coeff + offset) >> 18;
+        int V = ((int64_t) dstV[i] * coeff + offset) >> 18;
         dstU[i] = FFMIN(U, (1 << 19) - 1);
         dstV[i] = FFMIN(V, (1 << 19) - 1);
     }
 }
 
-static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
+static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width,
+                                 uint32_t coeff, int64_t offset)
 {
     int i;
     int32_t *dstU = (int32_t *) _dstU;
     int32_t *dstV = (int32_t *) _dstV;
     for (i = 0; i < width; i++) {
-        dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
-        dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
+        dstU[i] = ((int64_t) dstU[i] * coeff + offset) >> 18;
+        dstV[i] = ((int64_t) dstV[i] * coeff + offset) >> 18;
     }
 }
 
-static void lumRangeToJpeg16_c(int16_t *_dst, int width)
+static void lumRangeToJpeg16_c(int16_t *_dst, int width,
+                               uint32_t coeff, int64_t offset)
 {
     int i;
     int32_t *dst = (int32_t *) _dst;
     for (i = 0; i < width; i++) {
-        int Y = ((int)(dst[i] * 4769U - (39057361 << 2))) >> 12;
+        int Y = ((int64_t) dst[i] * coeff + offset) >> 18;
         dst[i] = FFMIN(Y, (1 << 19) - 1);
     }
 }
 
-static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
+static void lumRangeFromJpeg16_c(int16_t *_dst, int width,
+                                 uint32_t coeff, int64_t offset)
 {
     int i;
     int32_t *dst = (int32_t *) _dst;
     for (i = 0; i < width; i++)
-        dst[i] = ((int)(dst[i]*(14071U/4) + (33561947<<4)/4)) >> 12;
+        dst[i] = ((int64_t) dst[i] * coeff + offset) >> 18;
 }
 
 
@@ -547,11 +563,68 @@ int ff_swscale(SwsInternal *c, const uint8_t *const src[], const int srcStride[]
     return dstY - lastDstY;
 }
 
+/*
+ * Solve for coeff and offset:
+ * dst = ((src << src_shift) * coeff + offset) >> (mult_shift + src_shift)
+ *
+ * If SwsInternal->dstBpc is <= 14, coeff is uint16_t and offset is int32_t,
+ * otherwise (SwsInternal->dstBpc is > 14) coeff is uint32_t and offset is
+ * int64_t.
+ */
+static void solve_range_convert(uint16_t src_min, uint16_t src_max,
+                                uint16_t dst_min, uint16_t dst_max,
+                                int src_bits, int src_shift, int mult_shift,
+                                uint32_t *coeff, int64_t *offset)
+{
+    uint16_t src_range = src_max - src_min;
+    uint16_t dst_range = dst_max - dst_min;
+    int total_shift = mult_shift + src_shift;
+    *coeff = AV_CEIL_RSHIFT(((uint64_t) dst_range << total_shift) / src_range, src_shift);
+    *offset = ((int64_t) dst_max << total_shift) -
+              ((int64_t) src_max << src_shift) * *coeff;
+}
+
+static void init_range_convert_constants(SwsInternal *c)
+{
+    const int bit_depth = c->dstBpc ? c->dstBpc : 8;
+    const int src_bits = bit_depth <= 14 ? 15 : 19;
+    const int src_shift = src_bits - bit_depth;
+    const int mult_shift = bit_depth <= 14 ? 14 : 18;
+    const uint16_t mpeg_min = 16U << (bit_depth - 8);
+    const uint16_t mpeg_max_lum = 235U << (bit_depth - 8);
+    const uint16_t mpeg_max_chr = 240U << (bit_depth - 8);
+    const uint16_t jpeg_max = (1U << bit_depth) - 1;
+    uint16_t src_min, src_max_lum, src_max_chr;
+    uint16_t dst_min, dst_max_lum, dst_max_chr;
+    if (c->opts.src_range) {
+        src_min     = 0;
+        src_max_lum = jpeg_max;
+        src_max_chr = jpeg_max;
+        dst_min     = mpeg_min;
+        dst_max_lum = mpeg_max_lum;
+        dst_max_chr = mpeg_max_chr;
+    } else {
+        src_min     = mpeg_min;
+        src_max_lum = mpeg_max_lum;
+        src_max_chr = mpeg_max_chr;
+        dst_min     = 0;
+        dst_max_lum = jpeg_max;
+        dst_max_chr = jpeg_max;
+    }
+    solve_range_convert(src_min, src_max_lum, dst_min, dst_max_lum,
+                        src_bits, src_shift, mult_shift,
+                        &c->lumConvertRange_coeff, &c->lumConvertRange_offset);
+    solve_range_convert(src_min, src_max_chr, dst_min, dst_max_chr,
+                        src_bits, src_shift, mult_shift,
+                        &c->chrConvertRange_coeff, &c->chrConvertRange_offset);
+}
+
 av_cold void ff_sws_init_range_convert(SwsInternal *c)
 {
     c->lumConvertRange = NULL;
     c->chrConvertRange = NULL;
     if (c->opts.src_range != c->opts.dst_range && !isAnyRGB(c->opts.dst_format)) {
+        init_range_convert_constants(c);
         if (c->dstBpc <= 14) {
             if (c->opts.src_range) {
                 c->lumConvertRange = lumRangeFromJpeg_c;
 
@@ -647,10 +647,28 @@ struct SwsInternal {
                     const int32_t *filterPos, int filterSize);
     /** @} */
 
-    /// Color range conversion function for luma plane if needed.
-    void (*lumConvertRange)(int16_t *dst, int width);
-    /// Color range conversion function for chroma planes if needed.
-    void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
+    /**
+     * Color range conversion functions if needed.
+     * If SwsInternal->dstBpc is <= 14:
+     * - int16_t *dst (data is 15 bpc)
+     * - uint16_t coeff
+     * - int32_t offset
+     * Otherwise (SwsInternal->dstBpc is > 14):
+     * - int32_t *dst (data is 19 bpc)
+     * - uint32_t coeff
+     * - int64_t offset
+     */
+    /** @{ */
+    void (*lumConvertRange)(int16_t *dst, int width,
+                            uint32_t coeff, int64_t offset);
+    void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width,
+                            uint32_t coeff, int64_t offset);
+    /** @} */
+
+    uint32_t lumConvertRange_coeff;
+    uint32_t chrConvertRange_coeff;
+    int64_t  lumConvertRange_offset;
+    int64_t  chrConvertRange_offset;
 
     int needs_hcscale; ///< Set if there are chroma planes to be converted.
 
 
@@ -474,12 +474,17 @@ RANGE_CONVERT_FUNCS_DECL(avx2);
 
 av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
 {
+    /* This code is currently disabled because of changes in the base
+     * implementation of these functions. This code should be enabled
+     * again once those changes are ported to this architecture. */
+#if 0
     int cpu_flags = av_get_cpu_flags();
     if (EXTERNAL_AVX2_FAST(cpu_flags)) {
         RANGE_CONVERT_FUNCS(avx2);
     } else if (EXTERNAL_SSE2(cpu_flags)) {
         RANGE_CONVERT_FUNCS(sse2);
     }
+#endif
 }
 
 av_cold void ff_sws_init_swscale_x86(SwsInternal *c)
Original file line number	Diff line number	Diff line change
`@@ -225,6 +225,10 @@ void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);`
`225`	`225`
`226`	`226`	`av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)`
`227`	`227`	`{`
	`228`	`+ /* This code is currently disabled because of changes in the base`
	`229`	`+ * implementation of these functions. This code should be enabled`
	`230`	`+ * again once those changes are ported to this architecture. */`
	`231`	`+#if 0`
`228`	`232`	`int cpu_flags = av_get_cpu_flags();`
`229`	`233`
`230`	`234`	`if (have_neon(cpu_flags)) {`
`@@ -238,6 +242,7 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)`
`238`	`242`	`}`
`239`	`243`	`}`
`240`	`244`	`}`
	`245`	`+#endif`
`241`	`246`	`}`
`242`	`247`
`243`	`248`	`av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)`
Original file line number	Diff line number	Diff line change
`@@ -59,7 +59,8 @@ static int lum_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, in`
`59`	`59`	`}`
`60`	`60`
`61`	`61`	`if (c->lumConvertRange)`
`62`		`- c->lumConvertRange((int16_t*)dst[dst_pos], dstW);`
	`62`	`+ c->lumConvertRange((int16_t*)dst[dst_pos], dstW,`
	`63`	`+ c->lumConvertRange_coeff, c->lumConvertRange_offset);`
`63`	`64`
`64`	`65`	`desc->dst->plane[0].sliceH += 1;`
`65`	`66`
`@@ -192,7 +193,8 @@ static int chr_h_scale(SwsInternal *c, SwsFilterDescriptor *desc, int sliceY, in`
`192`	`193`	`}`
`193`	`194`
`194`	`195`	`if (c->chrConvertRange)`
`195`		`- c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);`
	`196`	`+ c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW,`
	`197`	`+ c->chrConvertRange_coeff, c->chrConvertRange_offset);`
`196`	`198`
`197`	`199`	`desc->dst->plane[1].sliceH += 1;`
`198`	`200`	`desc->dst->plane[2].sliceH += 1;`
Original file line number	Diff line number	Diff line change
`@@ -474,12 +474,17 @@ RANGE_CONVERT_FUNCS_DECL(avx2);`
`474`	`474`
`475`	`475`	`av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)`
`476`	`476`	`{`
	`477`	`+ /* This code is currently disabled because of changes in the base`
	`478`	`+ * implementation of these functions. This code should be enabled`
	`479`	`+ * again once those changes are ported to this architecture. */`
	`480`	`+#if 0`
`477`	`481`	`int cpu_flags = av_get_cpu_flags();`
`478`	`482`	`if (EXTERNAL_AVX2_FAST(cpu_flags)) {`
`479`	`483`	`RANGE_CONVERT_FUNCS(avx2);`
`480`	`484`	`} else if (EXTERNAL_SSE2(cpu_flags)) {`
`481`	`485`	`RANGE_CONVERT_FUNCS(sse2);`
`482`	`486`	`}`
	`487`	`+#endif`
`483`	`488`	`}`
`484`	`489`
`485`	`490`	`av_cold void ff_sws_init_swscale_x86(SwsInternal *c)`