@@ -42,47 +42,25 @@ const lowp int packed_dim = unhash_packed_dim(t_layout);
42
42
* Sign-extends an int8 value held in the low 8 bits of a 32-bit int.
43
43
*/
44
44
int extend_sign(int x) {
  // Sign-extend an 8-bit two's-complement value stored in the low byte of x.
  //
  // x is expected to be an already-masked byte in [0, 255]. When it is 128 or
  // above (bit 7 set) the upper 24 bits are filled with ones so the result is
  // the corresponding negative int; otherwise x is returned unchanged.
  //
  // ~0xFF is used instead of the literal 0xFFFFFF00 so the mask is formed
  // from a small int constant rather than a literal above INT_MAX.
  return (x >= 0x80) ? (x | ~0xFF) : x;
}
50
47
51
48
ivec4 read_texel(ivec4 tidx) {
52
- ivec4 tidx_to_use = tidx;
53
- ivec4 sizes_to_use = sizes;
54
- int packed_dim_to_use = packed_dim;
55
- if (transpose_hw == 1 ) {
56
- sizes_to_use.xy = sizes_to_use.yx;
57
- tidx_to_use.xy = tidx.yx;
58
-
59
- if (packed_dim == 1 ) {
60
- packed_dim_to_use = 0 ;
61
- }
62
- if (packed_dim == 0 ) {
63
- packed_dim_to_use = 1 ;
64
- }
65
- }
49
+ const ivec4 tidx_to_use = ivec4 (mix (tidx.xy, tidx.yx, bvec2 (transpose_hw == 1 )), tidx.zw);
50
+ const ivec4 sizes_to_use = ivec4 (mix (sizes.xy, sizes.yx, bvec2 (transpose_hw == 1 )), sizes.zw);
51
+ const int packed_dim_to_use = mix (packed_dim, packed_dim ^ transpose_hw, packed_dim < 2 );
66
52
67
53
const ivec4 buf_indices = tidx_to_nchwi(
68
54
tidx_to_use, sizes_to_use, packed_dim_to_use);
69
55
70
- int shift = (1 << 8 ) - 1 ;
71
- ivec4 masks;
72
- // Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that
73
- // little endian is assumed, as most processors use little endian. Thus the
74
- // most significant bytes correspond to the "latter" packed values.
75
- masks.x = shift << (8 * (buf_indices.x % 4 ));
76
- masks.y = shift << (8 * (buf_indices.y % 4 ));
77
- masks.z = shift << (8 * (buf_indices.z % 4 ));
78
- masks.w = shift << (8 * (buf_indices.w % 4 ));
56
+ const int mask = (1 << 8 ) - 1 ;
79
57
80
58
ivec4 out_tex = ivec4 (0 );
81
59
82
60
[[unroll]] for (int i = 0 ; i < 4 ; ++ i) {
83
61
if (tidx[packed_dim] + i < sizes[packed_dim]) {
84
- int in_texel = nchw_in[buf_indices[i] / 4 ];
85
- int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4 )) ;
62
+ const int in_texel = nchw_in[buf_indices[i] >> 2 ];
63
+ int extracted_val = (in_texel >> (8 * (buf_indices[i] & 3 ))) & mask ;
86
64
extracted_val = extend_sign(extracted_val);
87
65
out_tex[i] = extracted_val;
88
66
}
0 commit comments