A CUDA/C++ starter cheatsheet
Hardware and software
- Thread blocks and grids are logical groupings of threads; they exist to make programming easier.
- In hardware, each GPU is made of many streaming multiprocessors (SMs), each of which runs many threads.
Concepts
- kernel: the code (a function) that runs on the GPU. One kernel launch has exactly one grid; a grid contains blocks, and a block contains threads.
- thread, block, grid: the three levels of the logical hierarchy.
- threadIdx: each thread has a unique index within its block: threadIdx.x, .y, .z
- blockIdx: each block has a unique index within the grid: blockIdx.x, .y, .z
- dimension sizes: blockDim.x, .y, .z (threads per block) and gridDim.x, .y, .z (blocks per grid)
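A minimal sketch tying these together (the kernel name is my own; it just prints each thread's coordinates):

```cpp
#include <stdio.h>

__global__ void whoAmI()
{
    // global linear index for a 1-D grid of 1-D blocks
    int globalId = blockIdx.x * blockDim.x + threadIdx.x;
    printf("block %d, thread %d -> global id %d\n",
           blockIdx.x, threadIdx.x, globalId);
}

int main()
{
    whoAmI<<<2, 4>>>();       // one grid: 2 blocks of 4 threads each
    cudaDeviceSynchronize();  // wait for device-side printf to flush
    return 0;
}
```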
thread management
Streaming multiprocessor (SM)
Each GPU is made of many streaming multiprocessors (hardware). When a kernel's grid is launched, its blocks are assigned to the available SMs to run.
Warp
On an SM (hardware), CUDA executes threads as warps; the SM schedules warps and does not see which logical block they came from. Hardware thread resources are limited, so not all logical threads run at the same time. The smallest group of threads that physically executes together is called a warp (32 threads on current hardware).
For example, if one block is assigned 128 threads, then when it runs on an SM the block is divided into 4 warps:

```
warp0: thread 0,  ..., thread 31
warp1: thread 32, ..., thread 63
warp2: thread 64, ..., thread 95
warp3: thread 96, ..., thread 127
```
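In device code the warp size is available as the built-in variable `warpSize`; a small sketch (kernel name is illustrative) that reports each warp in the 128-thread block above:

```cpp
#include <stdio.h>

__global__ void warpInfo()
{
    int warpId = threadIdx.x / warpSize; // which warp within the block
    int laneId = threadIdx.x % warpSize; // position within the warp
    if (laneId == 0)                     // one line per warp
        printf("warp %d starts at thread %d\n", warpId, threadIdx.x);
}

int main()
{
    warpInfo<<<1, 128>>>();   // one block of 128 threads -> 4 warps of 32
    cudaDeviceSynchronize();
    return 0;
}
```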
special keywords
device: GPU
host: CPU
Compile with nvcc, not gcc (e.g. nvcc add.cu -o add).
| keyword | executed on | callable from | notes |
|---|---|---|---|
| `__global__` | device | device or host | must return void |
| `__device__` | device | device | |
| `__host__` | host | host | the default; may be omitted |
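A short sketch (function names are made up) showing how the qualifiers combine:

```cpp
// __device__: runs on the GPU, callable only from device code
__device__ float square(float x) { return x * x; }

// __global__: a kernel; launched from the host, must return void
__global__ void squareAll(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = square(data[i]);
}

// __host__ is the default and is usually omitted;
// combining __host__ __device__ compiles the function for both sides
__host__ __device__ float triple(float x) { return 3.0f * x; }
```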
kernel launch: <<<Dg,Db,Ns,S>>>
- Dg (dim3): specifies the dimension and size of the grid.
- Db (dim3): specifies the dimension and size of each block.
- Ns (size_t): specifies the number of bytes of shared memory dynamically allocated per block for this call, in addition to the statically allocated memory. Optional; defaults to 0.
- S (cudaStream_t): specifies the associated stream. Optional; defaults to 0.
(from the CUDA docs)
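A sketch (names are hypothetical) that exercises all four configuration arguments, including Ns bytes of dynamic shared memory:

```cpp
__global__ void copyThroughSharedMem(const float *in, float *out)
{
    // the Ns bytes passed at launch appear as this unsized shared array
    extern __shared__ float tile[];

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = in[i];
    __syncthreads();               // make the tile visible to the whole block
    out[i] = tile[threadIdx.x];
}

// launch: Dg = 64 blocks, Db = 256 threads per block,
// Ns = 256 * sizeof(float) bytes of dynamic shared memory, S = default stream
// copyThroughSharedMem<<<64, 256, 256 * sizeof(float), 0>>>(d_in, d_out);
```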
Vectors
Initialize a 1-D vector
```cpp
#include <stdio.h>

__global__
void initWith(float num, float *a, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  // grid-stride loop: each thread handles elements index, index + stride, ...
  // so a grid of any size can cover all N elements
  for(int i = index; i < N; i += stride)
  {
    a[i] = num;
  }
}
```
3-dimensional index

```cpp
tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
```
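Note that this tid is only the thread's linear index inside its block. A sketch (the helper name is my own) extending it to a unique index across the whole grid:

```cpp
__device__ int globalLinearIndex()
{
    // linear index of this thread within its block
    int tid = threadIdx.x
            + threadIdx.y * blockDim.x
            + threadIdx.z * blockDim.x * blockDim.y;

    // linear index of this block within the grid
    int bid = blockIdx.x
            + blockIdx.y * gridDim.x
            + blockIdx.z * gridDim.x * gridDim.y;

    int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    return bid * threadsPerBlock + tid;
}
```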
Vector addition

```cpp
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}
```
Common Program Flow
1. Get the device

```cpp
int deviceId;
int numberOfSMs;

// get the current device and its SM count
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
```
2. Allocate GPU memory

```cpp
const int N = 2<<24; // 2<<24 == 2^25 elements
size_t size = N * sizeof(float);

float *a;
float *b;
float *c;

// unified (managed) memory: accessible from both host and device
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);

// prefetch to the GPU
cudaMemPrefetchAsync(a, size, deviceId);
cudaMemPrefetchAsync(b, size, deviceId);
cudaMemPrefetchAsync(c, size, deviceId);

size_t threadsPerBlock;
size_t numberOfBlocks;

// a multiple of 32, because the hardware warp size is 32;
// this lets the device run more efficiently
threadsPerBlock = 256;
numberOfBlocks = 32 * numberOfSMs;

cudaError_t addVectorsErr; // error handling
cudaError_t asyncErr;
```
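The snippet above uses unified memory plus prefetching. For reference, a sketch of the more explicit alternative (h_a/d_a are illustrative names, assuming a host buffer of the same size):

```cpp
float *h_a = (float *)malloc(size); // host buffer
float *d_a;                         // device-only pointer
cudaMalloc(&d_a, size);             // allocate on the GPU

cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice); // host -> device
// ... launch kernels that read/write d_a ...
cudaMemcpy(h_a, d_a, size, cudaMemcpyDeviceToHost); // device -> host

cudaFree(d_a);
free(h_a);
```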
3. Create streams

```cpp
/*
 * Create 3 streams so the 3 data vectors can be initialized in parallel.
 */
cudaStream_t stream1, stream2, stream3;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
```
4. Launch kernels

```cpp
/*
 * Give each `initWith` launch its own non-default stream
 * so the three initializations can run concurrently.
 * Note the <<< >>> syntax: this is also called a "kernel launch".
 */
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);

// run the addition in the default stream
addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

// check for launch errors
addVectorsErr = cudaGetLastError();
if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

// critical: block the host until all device work has finished
asyncErr = cudaDeviceSynchronize();
if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));
```
5. Fetch data and run on the CPU

```cpp
// prefetch the result back to CPU memory
cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

// run a verification function on the CPU
checkElementsAre(7, c, N);
```
6. Free memory

```cpp
/*
 * Destroy streams when they are no longer needed.
 */
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaStreamDestroy(stream3);

// free GPU memory
cudaFree(a);
cudaFree(b);
cudaFree(c);
```
Here is the verification function that runs on the CPU:

```cpp
void checkElementsAre(float target, float *vector, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}
```