A CUDA/C++ starter cheatsheet
Hardware and software
- Thread blocks and grids are logical groupings of threads; they exist to make programming easier.
- In hardware, each GPU is made of many streaming multiprocessors (SMs), each of which runs many threads.
Concepts
- kernel: the code (a function) that runs on the GPU. One kernel launch has exactly one grid; a grid contains blocks, and a block contains threads.
- thread, block, grid: the three levels of the logical hierarchy.
- threadIdx: each thread has a unique index within its block: threadIdx.x, .y, .z
- blockIdx: each block has a unique index within the grid: blockIdx.x, .y, .z
- dimension sizes: blockDim.x, .y, .z (threads per block) and gridDim.x, .y, .z (blocks per grid)
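A minimal sketch tying these together (the kernel name is my own; it just prints each thread's coordinates):

```cpp
#include <stdio.h>

__global__ void whoAmI()
{
    // global linear index for a 1-D grid of 1-D blocks
    int globalId = blockIdx.x * blockDim.x + threadIdx.x;
    printf("block %d, thread %d -> global id %d\n",
           blockIdx.x, threadIdx.x, globalId);
}

int main()
{
    whoAmI<<<2, 4>>>();       // one grid: 2 blocks of 4 threads each
    cudaDeviceSynchronize();  // wait for device-side printf to flush
    return 0;
}
```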
thread management
Streaming multiprocessor (SM)
Each GPU is made of many streaming multiprocessors (hardware). When a kernel's grid is launched, its blocks are assigned to the available SMs to run.
Warp
On an SM (hardware), CUDA executes threads as warps; the SM schedules warps and does not see which logical block they came from. Hardware thread resources are limited, so not all logical threads run at the same time. The smallest group of threads that physically executes together is called a warp (32 threads on current hardware).
For example, if one block is assigned 128 threads, then when it runs on an SM the block is divided into 4 warps:

```
warp0: thread 0,  ..., thread 31
warp1: thread 32, ..., thread 63
warp2: thread 64, ..., thread 95
warp3: thread 96, ..., thread 127
```
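In device code the warp size is available as the built-in variable `warpSize`; a small sketch (kernel name is illustrative) that reports each warp in the 128-thread block above:

```cpp
#include <stdio.h>

__global__ void warpInfo()
{
    int warpId = threadIdx.x / warpSize; // which warp within the block
    int laneId = threadIdx.x % warpSize; // position within the warp
    if (laneId == 0)                     // one line per warp
        printf("warp %d starts at thread %d\n", warpId, threadIdx.x);
}

int main()
{
    warpInfo<<<1, 128>>>();   // one block of 128 threads -> 4 warps of 32
    cudaDeviceSynchronize();
    return 0;
}
```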
special keywords
device: GPU
host: CPU
Compile with nvcc, not gcc (e.g. nvcc add.cu -o add).
| keyword | executed on | callable from | notes |
|---|---|---|---|
| `__global__` | device | device or host | must return void |
| `__device__` | device | device | |
| `__host__` | host | host | the default; may be omitted |
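A short sketch (function names are made up) showing how the qualifiers combine:

```cpp
// __device__: runs on the GPU, callable only from device code
__device__ float square(float x) { return x * x; }

// __global__: a kernel; launched from the host, must return void
__global__ void squareAll(float *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] = square(data[i]);
}

// __host__ is the default and is usually omitted;
// combining __host__ __device__ compiles the function for both sides
__host__ __device__ float triple(float x) { return 3.0f * x; }
```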
kernel launch: <<<Dg,Db,Ns,S>>>
- Dg (dim3): specifies the dimension and size of the grid.
- Db (dim3): specifies the dimension and size of each block.
- Ns (size_t): specifies the number of bytes of shared memory dynamically allocated per block for this call, in addition to the statically allocated memory. Optional; defaults to 0.
- S (cudaStream_t): specifies the associated stream. Optional; defaults to 0.
(from the CUDA docs)
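A sketch (names are hypothetical) that exercises all four configuration arguments, including Ns bytes of dynamic shared memory:

```cpp
__global__ void copyThroughSharedMem(const float *in, float *out)
{
    // the Ns bytes passed at launch appear as this unsized shared array
    extern __shared__ float tile[];

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    tile[threadIdx.x] = in[i];
    __syncthreads();               // make the tile visible to the whole block
    out[i] = tile[threadIdx.x];
}

// launch: Dg = 64 blocks, Db = 256 threads per block,
// Ns = 256 * sizeof(float) bytes of dynamic shared memory, S = default stream
// copyThroughSharedMem<<<64, 256, 256 * sizeof(float), 0>>>(d_in, d_out);
```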
Vectors
Initialize a 1-D vector
```cpp
#include <stdio.h>

__global__
void initWith(float num, float *a, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  // grid-stride loop: each thread handles elements index, index + stride, ...
  // so a grid of any size can cover all N elements
  for(int i = index; i < N; i += stride)
  {
    a[i] = num;
  }
}
```
3-dimensional index

```cpp
tid = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.x * blockDim.y;
```
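Note that this tid is only the thread's linear index inside its block. A sketch (the helper name is my own) extending it to a unique index across the whole grid:

```cpp
__device__ int globalLinearIndex()
{
    // linear index of this thread within its block
    int tid = threadIdx.x
            + threadIdx.y * blockDim.x
            + threadIdx.z * blockDim.x * blockDim.y;

    // linear index of this block within the grid
    int bid = blockIdx.x
            + blockIdx.y * gridDim.x
            + blockIdx.z * gridDim.x * gridDim.y;

    int threadsPerBlock = blockDim.x * blockDim.y * blockDim.z;
    return bid * threadsPerBlock + tid;
}
```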
Vector addition

```cpp
__global__
void addVectorsInto(float *result, float *a, float *b, int N)
{
  int index = threadIdx.x + blockIdx.x * blockDim.x;
  int stride = blockDim.x * gridDim.x;

  for(int i = index; i < N; i += stride)
  {
    result[i] = a[i] + b[i];
  }
}
```
Common Program Flow
1. Get the device

```cpp
int deviceId;
int numberOfSMs;

// get the current device and its SM count
cudaGetDevice(&deviceId);
cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId);
```
2. Allocate GPU memory

```cpp
const int N = 2<<24; // 2<<24 == 2^25 elements
size_t size = N * sizeof(float);

float *a;
float *b;
float *c;

// unified (managed) memory: accessible from both host and device
cudaMallocManaged(&a, size);
cudaMallocManaged(&b, size);
cudaMallocManaged(&c, size);

// prefetch to the GPU
cudaMemPrefetchAsync(a, size, deviceId);
cudaMemPrefetchAsync(b, size, deviceId);
cudaMemPrefetchAsync(c, size, deviceId);

size_t threadsPerBlock;
size_t numberOfBlocks;

// a multiple of 32, because the hardware warp size is 32;
// this lets the device run more efficiently
threadsPerBlock = 256;
numberOfBlocks = 32 * numberOfSMs;

cudaError_t addVectorsErr; // error handling
cudaError_t asyncErr;
```
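The snippet above uses unified memory plus prefetching. For reference, a sketch of the more explicit alternative (h_a/d_a are illustrative names, assuming a host buffer of the same size):

```cpp
float *h_a = (float *)malloc(size); // host buffer
float *d_a;                         // device-only pointer
cudaMalloc(&d_a, size);             // allocate on the GPU

cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice); // host -> device
// ... launch kernels that read/write d_a ...
cudaMemcpy(h_a, d_a, size, cudaMemcpyDeviceToHost); // device -> host

cudaFree(d_a);
free(h_a);
```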
3. Create streams

```cpp
/*
 * Create 3 streams so the 3 data vectors can be initialized in parallel.
 */
cudaStream_t stream1, stream2, stream3;
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
cudaStreamCreate(&stream3);
```
4. Launch kernels

```cpp
/*
 * Give each `initWith` launch its own non-default stream
 * so the three initializations can run concurrently.
 * Note the <<< >>> syntax: this is also called a "kernel launch".
 */
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream1>>>(3, a, N);
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream2>>>(4, b, N);
initWith<<<numberOfBlocks, threadsPerBlock, 0, stream3>>>(0, c, N);

// run the addition in the default stream
addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N);

// check for launch errors
addVectorsErr = cudaGetLastError();
if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr));

// critical: block the host until all device work has finished
asyncErr = cudaDeviceSynchronize();
if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr));
```
5. Fetch data and run on the CPU

```cpp
// prefetch the result back to CPU memory
cudaMemPrefetchAsync(c, size, cudaCpuDeviceId);

// run a verification function on the CPU
checkElementsAre(7, c, N);
```
6. Free memory

```cpp
/*
 * Destroy streams when they are no longer needed.
 */
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaStreamDestroy(stream3);

// free GPU memory
cudaFree(a);
cudaFree(b);
cudaFree(c);
```
Here is the verification function that runs on the CPU:

```cpp
void checkElementsAre(float target, float *vector, int N)
{
  for(int i = 0; i < N; i++)
  {
    if(vector[i] != target)
    {
      printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target);
      exit(1);
    }
  }
  printf("Success! All values calculated correctly.\n");
}
```