diff --git a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py index b4337829d7..c415249383 100644 --- a/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py +++ b/backends/vulkan/_passes/squeeze_unsqueeze_inputs.py @@ -32,7 +32,13 @@ def should_squeeze(self, op, shape: List[int]) -> bool: # pyre-ignore return shape[1] == 1 and shape[0] > 1 if len(shape) == 4: # No need to squeeze if all dims are 1 except the width dim - if all(dim == 1 for dim in shape[:-1]): + if shape[0] == shape[1] == shape[2] == 1: + return False + # No need to squeeze if batch and channel dims are 1 and height and width are > 1 + if shape[0] == shape[1] == 1 and shape[2] > 1 and shape[3] > 1: + return False + # No need to squeeze if batch dim is 1 and channel, height and width are > 1 + if shape[0] == 1 and shape[1] > 1 and shape[2] > 1 and shape[3] > 1: return False # Otherwise, check for squeezable dim return 1 in shape[:-1] diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl index 3265a97398..0ee19206f5 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_output_tile.glsl @@ -47,11 +47,6 @@ layout(push_constant) uniform restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; -// For performance improvement, reduce register usage by caching positions in shared memory. -// Offset index by 1 every 16 points to avoid bank access conflict. -#define offset_pos_index(index) (index + ((index) >> 4)) -shared ivec3 pos_shared[offset_pos_index(LOCAL_WG_SIZE)]; - /* * Computes a depthwise convolution. Each shader invocation calculates the * output at a single output location. @@ -77,8 +72,6 @@ void main() { return; } - pos_shared[offset_pos_index(gl_LocalInvocationIndex)] = pos; - // Compute the index of the top-left element of the overlay region. Negative // indices indicate that the top-left element is in a region added by padding. const ivec2 ipos = pos.xy * stride - padding; @@ -89,13 +82,10 @@ void main() { const ivec2 end = ipos + overlay_region.xy; // sum outputs - VEC4_T sum[BATCH_SIZE_Y][BATCH_SIZE_X]; + VEC4_T sum[BATCH_SIZE_Y * BATCH_SIZE_X]; - sum[0][0] = texelFetch(t_bias, ivec2(pos.z, 0), 0); - for (int y = 0; y < BATCH_SIZE_Y; y++) { - for (int x = 0; x < BATCH_SIZE_X; x++) { - sum[y][x] = sum[0][0]; - } + for (int i = 0; i < BATCH_SIZE_Y * BATCH_SIZE_X; i++) { + sum[i] = VEC4_T(0); } // array to store input texels @@ -115,7 +105,7 @@ void main() { if (i > 0) { for (int j = 0; j < TILE_SIZE; j++) { for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[1][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[1][s]); + sum[BATCH_SIZE_X + s] = fma(in_texels[j + s], prev_kernel_line[j], sum[BATCH_SIZE_X + s]); } } } @@ -125,19 +115,19 @@ void main() { for (int j = 0; j < TILE_SIZE; j++, kx++) { prev_kernel_line[j] = texelFetch(t_kernel, ivec2(kx, pos.z), 0); for (int s = 0; s < BATCH_SIZE_X; s++) { - sum[0][s] = fma(in_texels[j + s], prev_kernel_line[j], sum[0][s]); + sum[s] = fma(in_texels[j + s], prev_kernel_line[j], sum[s]); } } } } - const ivec3 out_pos = pos_shared[offset_pos_index(gl_LocalInvocationIndex)]; + const VEC4_T bias = texelFetch(t_bias, ivec2(pos.z, 0), 0); for (int y = 0; y < BATCH_SIZE_Y; y++) { for (int x = 0; x < BATCH_SIZE_X; x++) { - if (any(greaterThanEqual(ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), out_limits.xyz))) { - continue; + const ivec3 out_pos = ivec3(pos.x + x, pos.y + y, pos.z); + if (all(lessThan(out_pos.xy, out_limits.xy))) { + imageStore(t_out, out_pos, op(sum[y * BATCH_SIZE_X + x] + bias, out_min, out_max)); } - imageStore(t_out, ivec3(out_pos.x + x, out_pos.y + y, out_pos.z), op(sum[y][x], out_min, out_max)); } } }