Closed
Description
utf8LengthByLeader
is widely used, appearing in most functions that perform iteration as it's used in stream
, iter
, etc. The current definition is
text/src/Data/Text/Internal/Encoding/Utf8.hs
Lines 83 to 99 in 5e57460
However,
- In most cases where this is used, one needs to read the next
Char
. This is done by branching on the result and checking if it's 1, 2, 3, or something else (4). This means that everythingutf8LengthByLeader
does is a waste! One can simply branch on the leading byte, as shown in the comment above. - The definition does not seem to be very efficient, at least on my machine. I have a couple of alternative branchless versions that are consistently faster.
Here are the implementations:
utf8LengthByLeader_branching :: Word8 -> Int
utf8LengthByLeader_branching w
| w < 0x80 = 1
| w < 0xE0 = 2
| w < 0xF0 = 3
| otherwise = 4
{-# INLINE utf8LengthByLeader_branching #-}
utf8LengthByLeader_simple :: Word8 -> Int
utf8LengthByLeader_simple (W8# w8#) =
let w# = word8ToWord# w8#
in 1
+ I# (w# `geWord#` 0x80##)
+ I# (w# `geWord#` 0xE0##)
+ I# (w# `geWord#` 0xF0##)
{-# INLINE utf8LengthByLeader_simple #-}
utf8LengthByLeader_lookup :: Word8 -> Int
utf8LengthByLeader_lookup (W8# w8#) =
I# (int8ToInt# (indexInt8OffAddr# a# (word2Int# (uncheckedShiftRL# (word8ToWord# w8#) 4#))))
where
a# = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"#
-- 0..7 -> 1
-- 8..B -> invalid leading byte
-- C..D -> 2
-- E -> 3
-- F -> 4
{-# INLINE utf8LengthByLeader_lookup #-}
I benchmarked the implementations on two situations:
sum
: branches on the resultlength
: does not branch on the result
The results are
100 ascii
sum
current: 242 ns ± 23 ns
branching: 70.6 ns ± 5.3 ns, 0.29x
simple: 162 ns ± 11 ns, 0.67x
lookup: 108 ns ± 5.2 ns, 0.45x
length
current: 467 ns ± 43 ns
branching: 56.7 ns ± 5.5 ns, 0.12x
simple: 306 ns ± 24 ns, 0.65x
lookup: 288 ns ± 21 ns, 0.62x
100 random
sum
current: 331 ns ± 21 ns
branching: 153 ns ± 10 ns, 0.46x
simple: 227 ns ± 21 ns, 0.69x
lookup: 188 ns ± 11 ns, 0.57x
length
current: 466 ns ± 43 ns
branching: 152 ns ± 12 ns, 0.33x
simple: 307 ns ± 23 ns, 0.66x
lookup: 285 ns ± 5.5 ns, 0.61x
100000 ascii
sum
current: 226 μs ± 11 μs
branching: 57.4 μs ± 5.4 μs, 0.25x
simple: 148 μs ± 11 μs, 0.65x
lookup: 95.0 μs ± 5.3 μs, 0.42x
length
current: 447 μs ± 43 μs
branching: 44.1 μs ± 2.8 μs, 0.10x
simple: 279 μs ± 27 μs, 0.62x
lookup: 272 μs ± 22 μs, 0.61x
100000 random
sum
current: 1.00 ms ± 87 μs
branching: 566 μs ± 46 μs, 0.57x
simple: 830 μs ± 45 μs, 0.83x
lookup: 850 μs ± 59 μs, 0.85x
length
current: 452 μs ± 45 μs
branching: 537 μs ± 49 μs, 1.19x
simple: 282 μs ± 24 μs, 0.62x
lookup: 274 μs ± 23 μs, 0.61x
So I think it would be good to
- Add a branching version of
utf8LengthByLeader
and use it where one needs to branch on the result - Replace the current version with either
simple
orlookup
.lookup
is sometimes faster but it means inlining that table everywhere.
It would also be good if someone can repeat these on a different CPU or arch to see if anything changes. The results above are from GHC 9.10.1 on an x86 Ryzen 5 3600.
Full benchmark source below.
Show
{- cabal:
build-depends: base, random, text, tasty-bench
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# OPTIONS_GHC -ddump-simpl -ddump-to-file #-}
import qualified Data.List as L
import qualified Data.Text as T
import System.Random (mkStdGen, uniforms)
import qualified Data.Text.Array as A
import Data.Text.Internal (Text(..))
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader, chr2, chr3, chr4)
import Data.Text.Internal.Unsafe.Char (unsafeChr8)
import Data.Text.Unsafe (iterArray, Iter(..))
import GHC.Word (Word8(..))
import GHC.Exts
import Test.Tasty.Bench
main :: IO ()
main = defaultMain
[ env (pure (mk n)) $ \t ->
bgroup name
[ bgroup "sum"
[ bench "current" $ whnf (mkSum utf8LengthByLeader) t
, bcompare (currentName "sum") $
bench "branching" $ whnf (mkSum utf8LengthByLeader_branching) t
, bcompare (currentName "sum") $
bench "simple" $ whnf (mkSum utf8LengthByLeader_simple) t
, bcompare (currentName "sum") $
bench "lookup" $ whnf (mkSum utf8LengthByLeader_lookup) t
]
, bgroup "length"
[ bench "current" $ whnf (mkLength utf8LengthByLeader) t
, bcompare (currentName "length") $
bench "branching" $ whnf (mkLength utf8LengthByLeader_branching) t
, bcompare (currentName "length") $
bench "simple" $ whnf (mkLength utf8LengthByLeader_simple) t
, bcompare (currentName "length") $
bench "lookup" $ whnf (mkLength utf8LengthByLeader_lookup) t
]
]
| n <- [100, 100000]
, (typ, mk) <- [("ascii", mkAscii), ("random", mkRandom)]
, let name = show n ++ " " ++ typ
currentName which = "All." ++ name ++ "." ++ which ++ ".current"
]
----------------------
-- Benchmarked funcs
----------------------
-- Branches on the char length
mkSum :: (Word8 -> Int) -> Text -> Int
mkSum f = \(Text arr off len) -> loop arr (off + len) off 0
where
loop !a !n !i !acc
| i >= n = acc
| otherwise = loop a n (i + d) (acc + fromEnum c)
where
m0 = A.unsafeIndex a i
m1 = A.unsafeIndex a (i + 1)
m2 = A.unsafeIndex a (i + 2)
m3 = A.unsafeIndex a (i + 3)
d = f m0
c = case d of
1 -> unsafeChr8 m0
2 -> chr2 m0 m1
3 -> chr3 m0 m1 m2
_ -> chr4 m0 m1 m2 m3
{-# INLINE mkSum #-}
-- Does not branch on the char length
mkLength :: (Word8 -> Int) -> Text -> Int
mkLength f = \(Text arr off len) -> loop arr (off + len) off 0
where
loop !a !n !i !acc
| i >= n = acc
| otherwise = loop a n (i + f (A.unsafeIndex a i)) (acc + 1)
{-# INLINE mkLength #-}
---------
-- Data
---------
mkAscii :: Int -> Text
mkAscii n =
T.pack $
L.take n $
L.cycle $
"The quick brown fox jumps over the lazy dog\n"
-- Each Char's utf-8 byte length is random. This takes away any advantage
-- the branching impl might have due to branch prediction.
mkRandom :: Int -> Text
mkRandom n =
T.pack $
L.map mkC $
L.take n $
uniforms (mkStdGen 42)
where
mkC :: Int -> Char
mkC i = toEnum $ case i `mod` 4 of
0 -> 0x00007F
1 -> 0x0007FF
2 -> 0x00FFFF
3 -> 0x10FFFF
-----------------------------
-- utf8LengthByLeader impls
-----------------------------
utf8LengthByLeader_branching :: Word8 -> Int
utf8LengthByLeader_branching w
| w < 0x80 = 1
| w < 0xE0 = 2
| w < 0xF0 = 3
| otherwise = 4
{-# INLINE utf8LengthByLeader_branching #-}
utf8LengthByLeader_simple :: Word8 -> Int
utf8LengthByLeader_simple (W8# w8#) =
let w# = word8ToWord# w8#
in 1
+ I# (w# `geWord#` 0x80##)
+ I# (w# `geWord#` 0xE0##)
+ I# (w# `geWord#` 0xF0##)
{-# INLINE utf8LengthByLeader_simple #-}
utf8LengthByLeader_lookup :: Word8 -> Int
utf8LengthByLeader_lookup (W8# w8#) =
I# (int8ToInt# (indexInt8OffAddr# a# (word2Int# (uncheckedShiftRL# (word8ToWord# w8#) 4#))))
where
a# = "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"#
-- 0..7 -> 1
-- 8..B -> invalid leading byte
-- C..D -> 2
-- E -> 3
-- F -> 4
{-# INLINE utf8LengthByLeader_lookup #-}
Metadata
Metadata
Assignees
Labels
No labels