Skip to content

In-place shift of uint vectors corrupts s1 and further components #358

Open
@proski

Description

@proski

This is a duplicate of intel/compute-runtime#790 - moved here as there is evidence that the issue is compiler related.

The issue can be reproduced Intel Compute Runtime versions from 23.22.26516.18 until 24.45.31740.9 (inclusive on both ends). Versions 23.17.26241.22 and older are not affected. Version 24.48.31907.7 (currently the latest release) is not affected either. Even though the latest release is not affected, I'd like someone to have a closer look, as the fix might be accidental.

Following is an improved version of the demo I posted in the original ticket.

#define CL_TARGET_OPENCL_VERSION 300

#include <CL/opencl.h>
#include <stdio.h>

#define VECTOR_SIZE 4
#define INPUT_SIZE (3 * VECTOR_SIZE)

#define CODE(...) #__VA_ARGS__

const char *KernelSource = CODE(

    __kernel void shift_test(__global uint *input, __global uint *output) {
      uint4 d0 = vload4(0, input);
      uint4 d1 = vload4(1, input);
      uint4 d2 = vload4(2, input);
      d0 = (d0 << 1) | (d1 >> 31);
      d1 = (d1 << 1) | (d2 >> 31);
      d2 = d2 << 1;

      int i = get_global_id(0);
      if (i == 0) {
        vstore4(d0, 0, output);
        vstore4(d1, 1, output);
        vstore4(d2, 2, output);
      }
    }

);

int main(int argc, char **argv) {
  unsigned int data[INPUT_SIZE] = {10, 10, 10, 10, 11, 11,
                                   11, 11, 12, 12, 12, 12};
  unsigned int results[INPUT_SIZE];

  cl_platform_id platform;
  int err = clGetPlatformIDs(1, &platform, NULL);
  if (err < 0) {
    perror("Couldn't identify a platform");
    exit(1);
  }

  cl_device_id device_id;
  int gpu = 1;
  err = clGetDeviceIDs(platform, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU,
                       1, &device_id, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to create a device group!\n");
    return EXIT_FAILURE;
  }

  cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
  if (!context) {
    printf("Error: Failed to create a compute context!\n");
    return EXIT_FAILURE;
  }

  cl_command_queue commands =
      clCreateCommandQueueWithProperties(context, device_id, 0, &err);
  if (!commands) {
    printf("Error: Failed to create a command queue!\n");
    return EXIT_FAILURE;
  }

  cl_program program = clCreateProgramWithSource(
      context, 1, (const char **)&KernelSource, NULL, &err);
  if (!program) {
    printf("Error: Failed to create compute program!\n");
    return EXIT_FAILURE;
  }

  err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS) {
    size_t len;
    char *buffer;

    printf("Error: Failed to build program executable!\n");
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL,
                          &len);
    buffer = malloc(len);
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, len, buffer,
                          NULL);
    printf("%s\n", buffer);
    free(buffer);
    exit(1);
  }

  cl_kernel kernel = clCreateKernel(program, "shift_test", &err);
  if (!kernel || err != CL_SUCCESS) {
    printf("Error: Failed to create compute kernel!\n");
    exit(1);
  }

  cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                sizeof(int) * INPUT_SIZE, NULL, NULL);
  cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                 sizeof(int) * INPUT_SIZE, NULL, NULL);
  if (!output) {
    printf("Error: Failed to allocate device memory!\n");
    exit(1);
  }

  err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0,
                             sizeof(int) * INPUT_SIZE, data, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to write to source array!\n");
    exit(1);
  }

  err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
  err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to set kernel arguments! %d\n", err);
    exit(1);
  }

  size_t local;
  err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
                                 sizeof(local), &local, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to retrieve kernel work group info! %d\n", err);
    exit(1);
  }

  size_t global = local;
  err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0,
                               NULL, NULL);
  if (err) {
    printf("Error: Failed to execute kernel!\n");
    return EXIT_FAILURE;
  }

  clFinish(commands);

  err = clEnqueueReadBuffer(commands, output, CL_TRUE, 0,
                            sizeof(int) * INPUT_SIZE, results, 0, NULL, NULL);
  if (err != CL_SUCCESS) {
    printf("Error: Failed to read output array! %d\n", err);
    exit(1);
  }

  for (int c = 0; c < VECTOR_SIZE; c++) {
    printf("result.s%d = %d:%d:%d\n", c, results[0 * VECTOR_SIZE + c],
           results[1 * VECTOR_SIZE + c], results[2 * VECTOR_SIZE + c]);
  }

  clReleaseMemObject(input);
  clReleaseMemObject(output);
  clReleaseProgram(program);
  clReleaseKernel(kernel);
  clReleaseCommandQueue(commands);
  clReleaseContext(context);

  return 0;
}

3 uint4 vectors (d0, d1 and d2) al loaded with identical values 10, 11 and 12. Then they are shifted by 1 bit in place, and then the top bit of the next vector (it should always be 0) is fed into the lower bit.

The expected result is that the values are multiplied by 2:

result.s0 = 20:22:24
result.s0 = 20:22:24
result.s0 = 20:22:24
result.s0 = 20:22:24

The actual output shows corruption of s1 and further components of the vectors that received the top bit from another vector.

result.s0 = 20:22:24
result.s1 = 0:0:24
result.s2 = 0:0:24
result.s3 = 0:0:24

The issue was originally observed in mfakto: primesearch/mfakto#15

mfakto saves the binary kernel and uses it as long as the configuration remains the same. If mfakto is run and then another version of Intel Compute Runtime is installed, the behavior of mfakto doesn't change, i.e. the behavior (buggy or correct) is captured in the compiled binary file. That makes me think that the issue is with the compiler.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions