From 0c046052966db54872885da197776cf9f93343a3 Mon Sep 17 00:00:00 2001
From: quaternic <57393910+quaternic@users.noreply.github.com>
Date: Sat, 28 Jun 2025 17:18:14 +0300
Subject: [PATCH 1/3] better support for u256 and i256

---
 libm-test/tests/u256.rs             |   8 +-
 libm/src/math/support/big.rs        | 133 +++++++++++++++++-----------
 libm/src/math/support/big/tests.rs  |  63 ++++++++++++-
 libm/src/math/support/int_traits.rs |  23 ++++-
 4 files changed, 169 insertions(+), 58 deletions(-)
diff --git a/libm-test/tests/u256.rs b/libm-test/tests/u256.rs
index 8cbb3ad22..e224ccf7a 100644
--- a/libm-test/tests/u256.rs
+++ b/libm-test/tests/u256.rs
@@ -111,7 +111,13 @@ fn mp_u256_add() {
         let y = random_u256(&mut rng);
         assign_bigint(&mut bx, x);
         assign_bigint(&mut by, y);
-        let actual = x + y;
+        let actual = if u256::MAX - x >= y {
+            x + y
+        } else {
+            // otherwise (u256::MAX - x) < y, so the wrapped result is
+            // (x + y) - (u256::MAX + 1) == y - (u256::MAX - x) - 1
+            y - (u256::MAX - x) - 1_u128.widen()
+        };
         bx += &by;
         check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
     }
diff --git a/libm/src/math/support/big.rs b/libm/src/math/support/big.rs
index 8a52d86cc..b7f128542 100644
--- a/libm/src/math/support/big.rs
+++ b/libm/src/math/support/big.rs
@@ -11,10 +11,10 @@ const U128_LO_MASK: u128 = u64::MAX as u128;
 
 /// A 256-bit unsigned integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
-#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
 pub struct u256 {
-    pub lo: u128,
     pub hi: u128,
+    pub lo: u128,
 }
 
 impl u256 {
@@ -28,17 +28,17 @@ impl u256 {
     pub fn signed(self) -> i256 {
         i256 {
             lo: self.lo,
-            hi: self.hi,
+            hi: self.hi as i128,
         }
     }
 }
 
 /// A 256-bit signed integer represented as two 128-bit native-endian limbs.
 #[allow(non_camel_case_types)]
-#[derive(Clone, Copy, Debug, PartialEq, PartialOrd)]
+#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
 pub struct i256 {
+    pub hi: i128,
     pub lo: u128,
-    pub hi: u128,
 }
 
 impl i256 {
@@ -47,7 +47,7 @@ impl i256 {
     pub fn unsigned(self) -> u256 {
         u256 {
             lo: self.lo,
-            hi: self.hi,
+            hi: self.hi as u128,
         }
     }
 }
@@ -73,17 +73,17 @@ impl MinInt for i256 {
 
     type Unsigned = u256;
 
-    const SIGNED: bool = false;
+    const SIGNED: bool = true;
     const BITS: u32 = 256;
     const ZERO: Self = Self { lo: 0, hi: 0 };
     const ONE: Self = Self { lo: 1, hi: 0 };
     const MIN: Self = Self {
-        lo: 0,
-        hi: 1 << 127,
+        lo: u128::MIN,
+        hi: i128::MIN,
     };
     const MAX: Self = Self {
         lo: u128::MAX,
-        hi: u128::MAX >> 1,
+        hi: i128::MAX,
     };
 }
 
@@ -109,60 +109,86 @@ macro_rules! impl_common {
             }
         }
 
-        impl ops::Shl<u32> for $ty {
+        impl ops::Add<Self> for $ty {
             type Output = Self;
 
-            fn shl(self, _rhs: u32) -> Self::Output {
-                unimplemented!("only used to meet trait bounds")
+            fn add(self, rhs: Self) -> Self::Output {
+                let (lo, carry) = self.lo.overflowing_add(rhs.lo);
+                let (hi, of) = Int::carrying_add(self.hi, rhs.hi, carry);
+                debug_assert!(!of, "attempt to add with overflow");
+                Self { lo, hi }
             }
         }
-    };
-}
 
-impl_common!(i256);
-impl_common!(u256);
+        impl ops::Sub<Self> for $ty {
+            type Output = Self;
 
-impl ops::Add<Self> for u256 {
-    type Output = Self;
+            fn sub(self, rhs: Self) -> Self::Output {
+                let (lo, borrow) = self.lo.overflowing_sub(rhs.lo);
+                let (hi, of) = Int::borrowing_sub(self.hi, rhs.hi, borrow);
+                debug_assert!(!of, "attempt to subtract with overflow");
+                Self { lo, hi }
+            }
+        }
 
-    fn add(self, rhs: Self) -> Self::Output {
-        let (lo, carry) = self.lo.overflowing_add(rhs.lo);
-        let hi = self.hi.wrapping_add(carry as u128).wrapping_add(rhs.hi);
+        impl ops::Shl<u32> for $ty {
+            type Output = Self;
 
-        Self { lo, hi }
-    }
-}
+            fn shl(mut self, rhs: u32) -> Self::Output {
+                debug_assert!(rhs < Self::BITS, "attempt to shift left with overflow");
 
-impl ops::Shr<u32> for u256 {
-    type Output = Self;
+                let half_bits = Self::BITS / 2;
+                let low_mask = half_bits - 1;
+                let s = rhs & low_mask;
 
-    fn shr(mut self, rhs: u32) -> Self::Output {
-        debug_assert!(rhs < Self::BITS, "attempted to shift right with overflow");
-        if rhs >= Self::BITS {
-            return Self::ZERO;
-        }
+                let lo = self.lo;
+                let hi = self.hi;
 
-        if rhs == 0 {
-            return self;
-        }
+                self.lo = lo << s;
 
-        if rhs < 128 {
-            self.lo >>= rhs;
-            self.lo |= self.hi << (128 - rhs);
-        } else {
-            self.lo = self.hi >> (rhs - 128);
+                if rhs & half_bits == 0 {
+                    self.hi = (lo >> (low_mask ^ s) >> 1) as _;
+                    self.hi |= hi << s;
+                } else {
+                    self.hi = self.lo as _;
+                    self.lo = 0;
+                }
+                self
+            }
         }
 
-        if rhs < 128 {
-            self.hi >>= rhs;
-        } else {
-            self.hi = 0;
-        }
+        impl ops::Shr<u32> for $ty {
+            type Output = Self;
 
-        self
-    }
+            fn shr(mut self, rhs: u32) -> Self::Output {
+                debug_assert!(rhs < Self::BITS, "attempt to shift right with overflow");
+
+                let half_bits = Self::BITS / 2;
+                let low_mask = half_bits - 1;
+                let s = rhs & low_mask;
+
+                let lo = self.lo;
+                let hi = self.hi;
+
+                self.hi = hi >> s;
+
+                #[allow(unused_comparisons)]
+                if rhs & half_bits == 0 {
+                    self.lo = (hi << (low_mask ^ s) << 1) as _;
+                    self.lo |= lo >> s;
+                } else {
+                    self.lo = self.hi as _;
+                    self.hi = if hi < 0 { !0 } else { 0 };
+                }
+                self
+            }
+        }
+    };
 }
 
+impl_common!(i256);
+impl_common!(u256);
+
 impl HInt for u128 {
     type D = u256;
 
@@ -200,7 +226,7 @@ impl HInt for u128 {
     }
 
     fn widen_hi(self) -> Self::D {
-        self.widen() << <Self as MinInt>::BITS
+        u256 { lo: 0, hi: self }
     }
 }
 
@@ -208,11 +234,10 @@ impl HInt for i128 {
     type D = i256;
 
     fn widen(self) -> Self::D {
-        let mut ret = self.unsigned().zero_widen().signed();
-        if self.is_negative() {
-            ret.hi = u128::MAX;
+        i256 {
+            lo: self as u128,
+            hi: if self < 0 { -1 } else { 0 },
         }
-        ret
     }
 
     fn zero_widen(self) -> Self::D {
@@ -228,7 +253,7 @@ impl HInt for i128 {
     }
 
     fn widen_hi(self) -> Self::D {
-        self.widen() << <Self as MinInt>::BITS
+        i256 { lo: 0, hi: self }
     }
 }
 
@@ -252,6 +277,6 @@ impl DInt for i256 {
     }
 
     fn hi(self) -> Self::H {
-        self.hi as i128
+        self.hi
     }
 }
diff --git a/libm/src/math/support/big/tests.rs b/libm/src/math/support/big/tests.rs
index d2010f021..d54706c72 100644
--- a/libm/src/math/support/big/tests.rs
+++ b/libm/src/math/support/big/tests.rs
@@ -36,7 +36,7 @@ fn widen_i128() {
         (LOHI_SPLIT as i128).widen(),
         i256 {
             lo: LOHI_SPLIT,
-            hi: u128::MAX
+            hi: -1,
         }
     );
     assert_eq!((-1i128).zero_widen().unsigned(), (u128::MAX).widen());
@@ -275,3 +275,64 @@ fn shr_u256_overflow() {
     assert_eq!(u256::MAX >> 257, u256::ZERO);
     assert_eq!(u256::MAX >> u32::MAX, u256::ZERO);
 }
+
+#[test]
+fn u256_ord() {
+    let _1 = u256::ONE;
+    let _2 = _1 + _1;
+    for x in u8::MIN..u8::MAX {
+        let y = x + 1;
+        let wx = (x as u128).widen_hi();
+        let wy = (y as u128).widen_hi();
+        assert!([wx, wx + _1, wx + _2, wy, wy + _1, wy + _2].is_sorted());
+    }
+}
+#[test]
+fn i256_ord() {
+    let _1 = i256::ONE;
+    let _2 = _1 + _1;
+    for x in i8::MIN..i8::MAX {
+        let y = x + 1;
+        let wx = (x as i128).widen_hi();
+        let wy = (y as i128).widen_hi();
+        assert!([wx, wx + _1, wx + _2, wy - _2, wy - _1, wy].is_sorted());
+    }
+}
+
+#[test]
+fn u256_shifts() {
+    let _1 = u256::ONE;
+    for k in 0..255 {
+        let x = _1 << k;
+        let x2 = _1 << (k + 1);
+        assert!(x < x2);
+        assert_eq!(x << 1, x2);
+        assert_eq!(x + x, x2);
+        assert_eq!(x >> k, _1);
+        assert_eq!(x2 >> (k + 1), _1);
+    }
+}
+#[test]
+fn i256_shifts() {
+    let _1 = i256::ONE;
+    for k in 0..254 {
+        let x = _1 << k;
+        let x2 = _1 << (k + 1);
+        assert!(x < x2);
+        assert_eq!(x << 1, x2);
+        assert_eq!(x + x, x2);
+        assert_eq!(x >> k, _1);
+        assert_eq!(x2 >> (k + 1), _1);
+    }
+
+    let min = _1 << 255;
+    assert_eq!(min, i256::MIN);
+    let mut x = min;
+    for k in 0..255 {
+        assert_eq!(x, min >> k);
+        let y = x >> 1;
+        assert_eq!(y + y, x);
+        assert!(x < y);
+        x = y;
+    }
+}
diff --git a/libm/src/math/support/int_traits.rs b/libm/src/math/support/int_traits.rs
index 9b29e2f45..9d8826dfe 100644
--- a/libm/src/math/support/int_traits.rs
+++ b/libm/src/math/support/int_traits.rs
@@ -37,8 +37,6 @@ pub trait Int:
     + fmt::Display
     + fmt::Binary
     + fmt::LowerHex
-    + PartialEq
-    + PartialOrd
     + ops::AddAssign
     + ops::SubAssign
     + ops::MulAssign
@@ -102,7 +100,10 @@ pub trait Int:
     fn rotate_left(self, other: u32) -> Self;
     fn overflowing_add(self, other: Self) -> (Self, bool);
     fn overflowing_sub(self, other: Self) -> (Self, bool);
+    fn carrying_add(self, other: Self, carry: bool) -> (Self, bool);
+    fn borrowing_sub(self, other: Self, borrow: bool) -> (Self, bool);
     fn leading_zeros(self) -> u32;
+    fn trailing_zeros(self) -> u32;
     fn ilog2(self) -> u32;
 }
 
@@ -168,12 +169,30 @@ macro_rules! int_impl_common {
             <Self>::leading_zeros(self)
         }
 
+        fn trailing_zeros(self) -> u32 {
+            <Self>::trailing_zeros(self)
+        }
+
         fn ilog2(self) -> u32 {
             // On our older MSRV, this resolves to the trait method. Which won't actually work,
             // but this is only called behind other gates.
             #[allow(clippy::incompatible_msrv)]
             <Self>::ilog2(self)
         }
+
+        fn carrying_add(self, other: Self, carry: bool) -> (Self, bool) {
+            let (ab, of1) = self.overflowing_add(other);
+            let (abc, of2) = ab.overflowing_add(Self::from_bool(carry));
+            // `of1 && of2` is possible with signed integers if a negative sum
+            // overflows to `MAX` and adding the carry overflows again back to `MIN`
+            (abc, of1 ^ of2)
+        }
+
+        fn borrowing_sub(self, other: Self, borrow: bool) -> (Self, bool) {
+            let (ab, of1) = self.overflowing_sub(other);
+            let (abc, of2) = ab.overflowing_sub(Self::from_bool(borrow));
+            (abc, of1 ^ of2)
+        }
     };
 }
 

From a322da6e2f3a756032ad0da49218933ef4c5a9c6 Mon Sep 17 00:00:00 2001
From: quaternic <57393910+quaternic@users.noreply.github.com>
Date: Mon, 30 Jun 2025 18:04:28 +0300
Subject: [PATCH 2/3] add tests for u256 Sub and Shl

---
 libm-test/tests/u256.rs | 38 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/libm-test/tests/u256.rs b/libm-test/tests/u256.rs
index e224ccf7a..d1c5cfbcc 100644
--- a/libm-test/tests/u256.rs
+++ b/libm-test/tests/u256.rs
@@ -123,6 +123,42 @@ fn mp_u256_add() {
     }
 }
 
+#[test]
+fn mp_u256_sub() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+    let mut by = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let y = random_u256(&mut rng);
+        assign_bigint(&mut bx, x);
+        assign_bigint(&mut by, y);
+
+        // since the operators (may) panic on overflow,
+        // we should test something that doesn't
+        let actual = if x >= y { x - y } else { y - x };
+        bx -= &by;
+        bx.abs_mut();
+        check_one(|| hexu(x), || Some(hexu(y)), actual, &mut bx);
+    }
+}
+
+#[test]
+fn mp_u256_shl() {
+    let mut rng = ChaCha8Rng::from_seed(*SEED);
+    let mut bx = BigInt::new();
+
+    for _ in 0..bigint_fuzz_iteration_count() {
+        let x = random_u256(&mut rng);
+        let shift: u32 = rng.random_range(0..256);
+        assign_bigint(&mut bx, x);
+        let actual = x << shift;
+        bx <<= shift;
+        check_one(|| hexu(x), || Some(shift.to_string()), actual, &mut bx);
+    }
+}
+
 #[test]
 fn mp_u256_shr() {
     let mut rng = ChaCha8Rng::from_seed(*SEED);
@@ -130,7 +166,7 @@ fn mp_u256_shr() {
 
     for _ in 0..bigint_fuzz_iteration_count() {
         let x = random_u256(&mut rng);
-        let shift: u32 = rng.random_range(0..255);
+        let shift: u32 = rng.random_range(0..256);
         assign_bigint(&mut bx, x);
         let actual = x >> shift;
         bx >>= shift;

From 75c1707d6528a453a69b28a5a2f0149d24587380 Mon Sep 17 00:00:00 2001
From: quaternic <57393910+quaternic@users.noreply.github.com>
Date: Tue, 1 Jul 2025 01:40:14 +0300
Subject: [PATCH 3/3] add icount benchmarks for u256 Sub and Shl

---
 libm-test/benches/icount.rs | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/libm-test/benches/icount.rs b/libm-test/benches/icount.rs
index a0928a29f..02ee13f80 100644
--- a/libm-test/benches/icount.rs
+++ b/libm-test/benches/icount.rs
@@ -119,6 +119,22 @@ fn icount_bench_u256_add(cases: Vec<(u256, u256)>) {
     }
 }
 
+#[library_benchmark]
+#[bench::linspace(setup_u256_add())]
+fn icount_bench_u256_sub(cases: Vec<(u256, u256)>) {
+    for (x, y) in cases.iter().copied() {
+        black_box(black_box(x) - black_box(y));
+    }
+}
+
+#[library_benchmark]
+#[bench::linspace(setup_u256_shift())]
+fn icount_bench_u256_shl(cases: Vec<(u256, u32)>) {
+    for (x, y) in cases.iter().copied() {
+        black_box(black_box(x) << black_box(y));
+    }
+}
+
 #[library_benchmark]
 #[bench::linspace(setup_u256_shift())]
 fn icount_bench_u256_shr(cases: Vec<(u256, u32)>) {
@@ -129,7 +145,7 @@ fn icount_bench_u256_shr(cases: Vec<(u256, u32)>) {
 
 library_benchmark_group!(
     name = icount_bench_u128_group;
-    benchmarks = icount_bench_u128_widen_mul, icount_bench_u256_add, icount_bench_u256_shr
+    benchmarks = icount_bench_u128_widen_mul, icount_bench_u256_add, icount_bench_u256_sub, icount_bench_u256_shl, icount_bench_u256_shr
 );
 
 #[library_benchmark]