The examples from dlsys-course

Bug: Shared memory needs to be initialized to 0 before access

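When n is not divisible by 512 (the number of threads per block), the last block is only partially filled, and the offset used in windowSumKernel to load the right-hand halo into shared memory (local_index + THREADS_PER_BLOCK) is wrong for that block.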

Here is my test code:

#include <cassert>
#include <cstdio>
#include <cstdlib>

#include <cuda_runtime.h>

#define RADIUS                3
#define THREADS_PER_BLOCK     512

// Sliding-window sum without shared memory: each thread produces one output
// element by reading its 2 * RADIUS + 1 inputs straight from A.
//
// A : padded input; output element k reads A[k] .. A[k + 2 * RADIUS], so A
//     must hold at least n + 2 * RADIUS floats.
// B : output, n floats.
// n : number of output elements; threads whose index is >= n do nothing.
//
// Expects a 1D launch with one thread per output element.
__global__ void windowSumNaiveKernel(const float* A, float* B, int n) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) {
    return;
  }
  // In the padded input the window of output gid starts at A[gid].
  const float* window = A + gid;
  float acc = 0.0f;
#pragma unroll
  for (int k = 0; k <= 2 * RADIUS; ++k) {
    acc += window[k];
  }
  B[gid] = acc;
}

// Sliding-window sum that stages each block's inputs in shared memory.
//
// A : padded input; output element k corresponds to A[k + RADIUS].
// B : output, n floats.
// n : number of output elements.
//
// temp holds THREADS_PER_BLOCK center elements plus RADIUS halo elements on
// each side, so the kernel assumes blockDim.x == THREADS_PER_BLOCK.
//
// KNOWN DEFECTS (this is the version that fails when n is not a multiple of
// THREADS_PER_BLOCK):
//   * Only threads with out_index < n load anything, and the right halo is
//     always stored THREADS_PER_BLOCK slots past the center. In a partially
//     filled last block the RADIUS slots just after the last loaded center
//     element are therefore never written, yet the summation loop reads them.
//   * A[in_index + THREADS_PER_BLOCK] can index past the n + 2 * RADIUS floats
//     that windowSum allocates when executed by the last block.
//   * __syncthreads() sits inside `if (out_index < n)`, so threads of the last
//     block that fail the guard never reach the barrier.
__global__ void windowSumKernel(const float* A, float* B, int n) {
  __shared__ float temp[THREADS_PER_BLOCK + 2 * RADIUS];
  int out_index = blockDim.x * blockIdx.x + threadIdx.x;
  // Position of this thread's center element in the padded input.
  int in_index = out_index + RADIUS;
  // Position of this thread's center element in temp (after the left halo).
  int local_index = threadIdx.x + RADIUS;
  if (out_index < n) {
    temp[local_index] = A[in_index];
    // The first RADIUS threads additionally load the left and right halos.
    if (threadIdx.x < RADIUS) {
      temp[local_index - RADIUS] = A[in_index - RADIUS];
      // Offset is the full block size even if fewer threads passed the guard.
      temp[local_index + THREADS_PER_BLOCK] = A[in_index +  THREADS_PER_BLOCK];
    }
    // Barrier in divergent control flow: not all threads of the block arrive.
    __syncthreads();
    float sum = 0.;
#pragma unroll
    for (int i = -RADIUS; i <= RADIUS; ++i) {
      sum += temp[local_index + i];
    }
    B[out_index] = sum;
  }
}

// Reports a failed CUDA runtime call on stderr and aborts the process.
// Defined and undefined around windowSumNaive so it does not leak into the
// rest of the file.
#define WINDOW_SUM_CUDA_CHECK(call)                                          \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                         __LINE__, cudaGetErrorString(err_));                \
            std::abort();                                                    \
        }                                                                    \
    } while (0)

// Computes the sliding-window sum of A into B on the GPU with
// windowSumNaiveKernel.
//
// A : host input, n floats.
// B : host output, n floats.
// n : number of elements; n <= 0 is a no-op.
//
// The input is copied into a device buffer with RADIUS zeroed floats on each
// side, so window positions outside [0, n - 1] contribute 0.
// Any CUDA failure is printed to stderr and aborts the process.
void windowSumNaive(const float* A, float* B, int n) {
    if (n <= 0) {
        return;  // nothing to compute; a launch with zero blocks is invalid
    }
    // Byte counts in size_t so large n cannot overflow int.
    const size_t size = static_cast<size_t>(n) * sizeof(float);
    const size_t padded_size =
        (static_cast<size_t>(n) + 2 * RADIUS) * sizeof(float);

    float *d_A = NULL, *d_B = NULL;
    WINDOW_SUM_CUDA_CHECK(cudaMalloc((void **) &d_A, padded_size));
    WINDOW_SUM_CUDA_CHECK(cudaMemset(d_A, 0, padded_size));
    WINDOW_SUM_CUDA_CHECK(
        cudaMemcpy(d_A + RADIUS, A, size, cudaMemcpyHostToDevice));
    WINDOW_SUM_CUDA_CHECK(cudaMalloc((void **) &d_B, size));

    dim3 threads(THREADS_PER_BLOCK, 1, 1);
    // Ceiling division written so that n + THREADS_PER_BLOCK cannot overflow.
    dim3 blocks((n - 1) / THREADS_PER_BLOCK + 1, 1, 1);
    windowSumNaiveKernel<<<blocks, threads>>>(d_A, d_B, n);
    // Launch-configuration errors are only visible through cudaGetLastError.
    WINDOW_SUM_CUDA_CHECK(cudaGetLastError());
    // Blocking copy: waits for the kernel and reports faults raised inside it.
    WINDOW_SUM_CUDA_CHECK(cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost));

    WINDOW_SUM_CUDA_CHECK(cudaFree(d_A));
    WINDOW_SUM_CUDA_CHECK(cudaFree(d_B));
}

#undef WINDOW_SUM_CUDA_CHECK

// Reports a failed CUDA runtime call on stderr and aborts the process.
// Defined and undefined around windowSum so it does not leak into the rest
// of the file.
#define WINDOW_SUM_CUDA_CHECK(call)                                          \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            std::fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__,         \
                         __LINE__, cudaGetErrorString(err_));                \
            std::abort();                                                    \
        }                                                                    \
    } while (0)

// Computes the sliding-window sum of A into B on the GPU with the
// shared-memory kernel windowSumKernel.
//
// A : host input, n floats.
// B : host output, n floats.
// n : number of elements; n <= 0 is a no-op.
//
// The input is copied into a device buffer with RADIUS zeroed floats on each
// side, so window positions outside [0, n - 1] contribute 0.
// The kernel is launched with exactly THREADS_PER_BLOCK threads per block.
// Any CUDA failure is printed to stderr and aborts the process.
void windowSum(const float* A, float* B, int n) {
    if (n <= 0) {
        return;  // nothing to compute; a launch with zero blocks is invalid
    }
    // Byte counts in size_t so large n cannot overflow int.
    const size_t size = static_cast<size_t>(n) * sizeof(float);
    const size_t padded_size =
        (static_cast<size_t>(n) + 2 * RADIUS) * sizeof(float);

    float *d_A = NULL, *d_B = NULL;
    WINDOW_SUM_CUDA_CHECK(cudaMalloc((void **) &d_A, padded_size));
    WINDOW_SUM_CUDA_CHECK(cudaMemset(d_A, 0, padded_size));
    WINDOW_SUM_CUDA_CHECK(
        cudaMemcpy(d_A + RADIUS, A, size, cudaMemcpyHostToDevice));
    WINDOW_SUM_CUDA_CHECK(cudaMalloc((void **) &d_B, size));

    dim3 threads(THREADS_PER_BLOCK, 1, 1);
    // Ceiling division written so that n + THREADS_PER_BLOCK cannot overflow.
    dim3 blocks((n - 1) / THREADS_PER_BLOCK + 1, 1, 1);
    windowSumKernel<<<blocks, threads>>>(d_A, d_B, n);
    // Launch-configuration errors are only visible through cudaGetLastError.
    WINDOW_SUM_CUDA_CHECK(cudaGetLastError());
    // Blocking copy: waits for the kernel and reports faults raised inside it.
    WINDOW_SUM_CUDA_CHECK(cudaMemcpy(B, d_B, size, cudaMemcpyDeviceToHost));

    WINDOW_SUM_CUDA_CHECK(cudaFree(d_A));
    WINDOW_SUM_CUDA_CHECK(cudaFree(d_B));
}

#undef WINDOW_SUM_CUDA_CHECK

// CPU reference: B[i] is the sum of A[j] for every j within RADIUS of i,
// with the window clipped to the valid index range [0, n - 1].
void windowSumCpu(const float* A, float* B, int n) {
    for (int center = 0; center < n; ++center) {
        // Clip the window to the array bounds.
        const int first = (center - RADIUS > 0) ? center - RADIUS : 0;
        const int last = (center + RADIUS < n - 1) ? center + RADIUS : n - 1;

        B[center] = 0;
        int k = first;
        while (k <= last) {
            B[center] += A[k];
            ++k;
        }
    }
}

int main() {
  // 1000000 is not a multiple of 512, so the last block is only partly
  // filled; the alternative size below divides evenly.
  // int n = 1024 * 1024;
  const int n = 1000000;

  float* A = new float[n];
  float* B_cpu = new float[n];
  float* B_gpu1 = new float[n];
  float* B_gpu2 = new float[n];

  // Fill the input with the ramp 0, 1, 2, ..., n - 1.
  int idx = 0;
  while (idx < n) {
    A[idx] = idx;
    ++idx;
  }

  // Reference result first, then the two GPU implementations.
  windowSumCpu(A, B_cpu, n);
  windowSumNaive(A, B_gpu1, n);
  windowSum(A, B_gpu2, n);

  // Both GPU results must match the CPU reference exactly.
  // Observed failure: int main(): Assertion `B_cpu[i] == B_gpu2[i]' failed.
  for (int i = 0; i != n; ++i) {
    assert(B_cpu[i] == B_gpu1[i]);
    assert(B_cpu[i] == B_gpu2[i]);
  }

  delete [] B_gpu2;
  delete [] B_gpu1;
  delete [] B_cpu;
  delete [] A;
  return 0;
}

There are two ways to solve the problem:

Initialize the shared memory to 0 before accessing it.

// Sliding-window sum that stages each block's inputs in shared memory,
// zero-filling the staging buffer first so any slot that is not loaded
// contributes 0.
//
// A : padded input of n + 2 * RADIUS floats; output element k corresponds to
//     A[k + RADIUS].
// B : output, n floats.
// n : number of output elements.
//
// Launch requirements: 1D grid, blockDim.x == THREADS_PER_BLOCK (the halo
// offsets and the size of temp depend on it). Uses
// (THREADS_PER_BLOCK + 2 * RADIUS) floats of static shared memory.
__global__ void windowSumKernel(const float* A, float* B, int n) {
  __shared__ float temp[THREADS_PER_BLOCK + 2 * RADIUS];
  int out_index = blockDim.x * blockIdx.x + threadIdx.x;
  int in_index = out_index + RADIUS;
  int local_index = threadIdx.x + RADIUS;
  // Number of valid elements in the padded input.
  int padded_n = n + 2 * RADIUS;

  // Init shared memory to 0 before access. All threads share the work
  // instead of thread 0 doing it alone.
  for (int i = threadIdx.x; i < THREADS_PER_BLOCK + 2 * RADIUS;
       i += blockDim.x) {
    temp[i] = 0.0f;
  }
  __syncthreads();

  // Loads are guarded by the padded length rather than by out_index < n, so
  // in a partially filled last block the threads just past the end still
  // bring in the right-hand padding, and nothing reads outside A.
  if (in_index < padded_n) {
    temp[local_index] = A[in_index];
  }
  if (threadIdx.x < RADIUS) {
    if (in_index - RADIUS < padded_n) {
      temp[local_index - RADIUS] = A[in_index - RADIUS];
    }
    if (in_index + THREADS_PER_BLOCK < padded_n) {
      temp[local_index + THREADS_PER_BLOCK] = A[in_index + THREADS_PER_BLOCK];
    }
  }
  // Outside every data-dependent branch: each thread of the block arrives.
  __syncthreads();

  if (out_index < n) {
    float sum = 0.0f;
#pragma unroll
    for (int i = -RADIUS; i <= RADIUS; ++i) {
      sum += temp[local_index + i];
    }
    B[out_index] = sum;
  }
}

Use the correct offset for the right-hand halo.

// Sliding-window sum that stages each block's inputs in shared memory and
// places the right-hand halo directly after the elements the block actually
// owns, so a partially filled last block reads only slots that were loaded.
//
// A : padded input of n + 2 * RADIUS floats; output element k corresponds to
//     A[k + RADIUS].
// B : output, n floats.
// n : number of output elements.
//
// Launch requirements: 1D grid, blockDim.x == THREADS_PER_BLOCK (the size of
// temp and the per-block element count depend on it). Uses
// (THREADS_PER_BLOCK + 2 * RADIUS) floats of static shared memory.
__global__ void windowSumKernel(const float* A, float* B, int n) {
  __shared__ float temp[THREADS_PER_BLOCK + 2 * RADIUS];
  // Signed block start so that n - block_start below is not evaluated in
  // unsigned arithmetic.
  int block_start = blockDim.x * blockIdx.x;
  int out_index = block_start + threadIdx.x;
  int in_index = out_index + RADIUS;
  int local_index = threadIdx.x + RADIUS;

  int remaining = n - block_start;
  if (remaining <= 0) {
    // Block lies entirely past the data. The condition is identical for
    // every thread of the block, so no thread is left waiting at the barrier.
    return;
  }
  // compute the number of elements of this block
  int num = min(THREADS_PER_BLOCK, remaining);

  if (out_index < n) {
    temp[local_index] = A[in_index];
  }
  // Halo loads are NOT gated on out_index < n: when num < RADIUS the first
  // RADIUS threads must still fetch all RADIUS halo elements on each side.
  // Both reads stay inside the padded input because block_start + num <= n.
  if (threadIdx.x < RADIUS) {
    temp[local_index - RADIUS] = A[in_index - RADIUS];
    // use correct offset
    temp[local_index + num] = A[in_index + num];
  }
  // Outside every data-dependent branch: each thread of the block arrives.
  __syncthreads();

  if (out_index < n) {
    float sum = 0.0f;
#pragma unroll
    for (int i = -RADIUS; i <= RADIUS; ++i) {
      sum += temp[local_index + i];
    }
    B[out_index] = sum;
  }
}

dlsys-course / examples Goto Github PK

examples's People

Contributors

Stargazers

Watchers

Forkers

examples's Issues

Bug: Shared memory need initialization with 0 before access

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent

Jobs