Skip to content

Commit d32c5c7

Browse files
committed
[ET-VK] Minor performance improvements for buffer to int8 quantized packing.
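Pull Request resolved: #12383
This diff provides minor performance improvements for buffer to int8 quantized packing in the Vulkan runtime graph ops. Specifically, it replaces the branches in extend_sign and in the transpose_hw handling of read_texel with mix() selects, replaces the four per-component shifted masks with a single constant 8-bit mask that is applied after shifting the packed word, and uses `>> 2` / `& 3` in place of `/ 4` / `% 4` when indexing into the packed buffer.
ghstack-source-id: 295679184
Differential Revision: [D74616519](https://our.internmc.facebook.com/intern/diff/D74616519/)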
1 parent a05efc5 commit d32c5c7

File tree

1 file changed

+7
-29
lines changed

1 file changed

+7
-29
lines changed

backends/vulkan/runtime/graph/ops/glsl/nchw_to_bitw8_image_nobitw8buffer.glsl

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -42,47 +42,25 @@ const lowp int packed_dim = unhash_packed_dim(t_layout);
4242
* Extends sign of int8
4343
*/
4444
int extend_sign(int x) {
45-
if (x >> 7 == 1) {
46-
return x | 0xFFFFFF00;
47-
}
48-
return x;
45+
return x | mix(0, 0xFFFFFF00, x >= (1 << 7));
4946
}
5047

5148
ivec4 read_texel(ivec4 tidx) {
52-
ivec4 tidx_to_use = tidx;
53-
ivec4 sizes_to_use = sizes;
54-
int packed_dim_to_use = packed_dim;
55-
if (transpose_hw == 1) {
56-
sizes_to_use.xy = sizes_to_use.yx;
57-
tidx_to_use.xy = tidx.yx;
58-
59-
if (packed_dim == 1) {
60-
packed_dim_to_use = 0;
61-
}
62-
if (packed_dim == 0) {
63-
packed_dim_to_use = 1;
64-
}
65-
}
49+
const ivec4 tidx_to_use = ivec4(mix(tidx.xy, tidx.yx, bvec2(transpose_hw == 1)), tidx.zw);
50+
const ivec4 sizes_to_use = ivec4(mix(sizes.xy, sizes.yx, bvec2(transpose_hw == 1)), sizes.zw);
51+
const int packed_dim_to_use = mix(packed_dim, packed_dim ^ transpose_hw, packed_dim < 2);
6652

6753
const ivec4 buf_indices = tidx_to_nchwi(
6854
tidx_to_use, sizes_to_use, packed_dim_to_use);
6955

70-
int shift = (1 << 8) - 1;
71-
ivec4 masks;
72-
// Masks used to unpack 4x 8-bit values from a 32 bit integer. Note that
73-
// little endian is assumed, as most processors use little endian. Thus the
74-
// most significant bytes correspond to the "latter" packed values.
75-
masks.x = shift << (8 * (buf_indices.x % 4));
76-
masks.y = shift << (8 * (buf_indices.y % 4));
77-
masks.z = shift << (8 * (buf_indices.z % 4));
78-
masks.w = shift << (8 * (buf_indices.w % 4));
56+
const int mask = (1 << 8) - 1;
7957

8058
ivec4 out_tex = ivec4(0);
8159

8260
[[unroll]] for (int i = 0; i < 4; ++i) {
8361
if (tidx[packed_dim] + i < sizes[packed_dim]) {
84-
int in_texel = nchw_in[buf_indices[i] / 4];
85-
int extracted_val = (in_texel & masks[i]) >> (8 * (buf_indices[i] % 4));
62+
const int in_texel = nchw_in[buf_indices[i] >> 2];
63+
int extracted_val = (in_texel >> (8 * (buf_indices[i] & 3))) & mask;
8664
extracted_val = extend_sign(extracted_val);
8765
out_tex[i] = extracted_val;
8866
}

0 commit comments

Comments
 (0)