/* Lesson 1 --Code from Quiz */
#include <stdio.h>
// each thread cubes one element of the input array
__global__ void cube(float * d_out, float * d_in){
    int thid = threadIdx.x;
    float num = d_in[thid];
    d_out[thid] = num * num * num;
}
int main(int argc, char ** argv) {
    const int ARRAY_SIZE = 96;
    const int ARRAY_BYTES = ARRAY_SIZE * sizeof(float);

    // generate the input array on the host
    float h_in[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++) {
        h_in[i] = float(i);
    }
    float h_out[ARRAY_SIZE];

    // declare GPU memory pointers
    float * d_in;
    float * d_out;

    // allocate GPU memory
    cudaMalloc((void**) &d_in, ARRAY_BYTES);
    cudaMalloc((void**) &d_out, ARRAY_BYTES);

    // transfer the array to the GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    // launch the kernel
    cube<<<1, ARRAY_SIZE>>>(d_out, d_in);

    // copy back the result array to the CPU
    cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);

    // print out the resulting array
    for (int i = 0; i < ARRAY_SIZE; i++) {
        printf("%f", h_out[i]);
        printf(((i % 4) != 3) ? "\t" : "\n");
    }

    cudaFree(d_in);
    cudaFree(d_out);

    return 0;
}
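The launch cube<<<1, ARRAY_SIZE>>> runs everything in a single block of 96 threads, which only works because the whole array fits in one block (at most 1024 threads on current GPUs). A minimal sketch of the multi-block version, assuming a hypothetical cube_multiblock kernel and a size parameter n that are not part of the quiz code:
// Sketch (not from the quiz): the same work spread over several blocks.
// Each thread computes a global index and guards against running off the end.
__global__ void cube_multiblock(float * d_out, float * d_in, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float num = d_in[idx];
        d_out[idx] = num * num * num;
    }
}
// launch with enough 128-thread blocks to cover n elements:
// cube_multiblock<<<(n + 127) / 128, 128>>>(d_out, d_in, n);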
/* LESSON 2 */
many threads solving a problem by working together
parallel communication patterns (sketch of three of them below):
map        one to one
transpose  one to one
gather     many to one
scatter    one to many
stencil    several to one
reduce     all to one
scan/sort  all to all
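A rough sketch of three of these patterns as kernels (my own illustration, not lesson code; kernel and array names are made up):
// map: one to one -- each output element comes from exactly one input element
__global__ void map_k(float *out, const float *in) {
    int i = threadIdx.x;
    out[i] = 2.0f * in[i];
}
// gather: many to one -- each output element reads several input elements
__global__ void gather_k(float *out, const float *in) {
    int i = threadIdx.x;
    out[i] = 0.5f * (in[2 * i] + in[2 * i + 1]);   // each thread reads two inputs
}
// scatter: one to many -- each thread computes where its input contributes
__global__ void scatter_k(float *out, const float *in) {
    int i = threadIdx.x;
    out[2 * i]     += 0.5f * in[i];                // each thread writes two outputs
    out[2 * i + 1] += 0.5f * in[i];
}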
shared memory    shared by the threads within a block
global memory    shared by all threads on the GPU
local memory     private to each thread
__syncthreads(); is crucial when you separate read and write phases --
must ensure all values are written before any thread starts reading them
maximize arithmetic intensity (math operations per memory access)
-minimize the time spent on memory per thread
-local memory > shared memory >> global memory
__shared__ float sh_array[128];
sh_array[index] = array[index];   // copies from global to shared
__syncthreads();                  // ensures the copy is complete before anyone reads it
make sure to coalesce global memory accesses -- adjacent threads should read adjacent memory locations
shared memory has the lifetime of the thread block
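Putting the last few notes together, a sketch of a kernel that stages its block's slice of global memory into shared memory, syncs, then reads it (names and the 128-thread block size are my assumptions, not lesson code):
// averages each element with its right neighbour, reading from shared memory
__global__ void smooth(float *out, const float *in) {
    __shared__ float sh_array[128];      // lives only as long as the thread block
    int index = threadIdx.x;
    sh_array[index] = in[index];         // coalesced copy: global -> shared
    __syncthreads();                     // every write must finish before any read
    if (index < 127)                     // last thread has no right neighbour
        out[index] = 0.5f * (sh_array[index] + sh_array[index + 1]);
}
// launched as smooth<<<1, 128>>>(d_out, d_in);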
when many threads write to the same array elements you get race conditions
CUDA has a class of functions called atomics for this
e.g.)
atomicAdd(&g[i], 1);   // atomically adds 1 to g[i]
atomicCAS() can be used as a building block for any other atomic operation (sketch after the kernels below)
// (ARRAY_SIZE is a constant defined elsewhere in the program)
__global__ void increment_naive(int *g)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments consecutive elements, wrapping at ARRAY_SIZE
    i = i % ARRAY_SIZE;
    g[i] = g[i] + 1;        // read-modify-write is not atomic: threads can race
}

__global__ void increment_atomic(int *g)
{
    // which thread is this?
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // each thread increments consecutive elements, wrapping at ARRAY_SIZE
    i = i % ARRAY_SIZE;
    atomicAdd(&g[i], 1);    // the same increment, done atomically
}
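To back up the atomicCAS() note above: any read-modify-write can be built from a compare-and-swap retry loop. A sketch (my own, not lesson code) of an atomic multiply for ints, which CUDA does not provide directly:
// keep retrying until the value we read is still the value in memory at swap time
__device__ int atomicMul(int *addr, int val) {
    int old = *addr, assumed;
    do {
        assumed = old;
        old = atomicCAS(addr, assumed, assumed * val);
    } while (assumed != old);
    return old;     // returns the value seen just before our multiply landed
}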
//Summary
gather, scatter, stencil, transpose
SMs, threads, blocks, ordering
local, global, shared, atomics
//Efficient GPU programming
high arithmetic intensity -- move data to faster memory if you need to
local > shared > global
use coalesced global memory if you need global
avoid diverging threads (badly designed if statements and loops)
loops force syncing too: every thread in the warp effectively goes through the loop n times, where n is the largest trip count (see the sketch below)
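A small illustration (mine, not from the lesson) of why branch shape matters: threads in a warp that take different paths get serialized, while a branch that is uniform across each 32-thread warp does not diverge:
// bad: even and odd threads of the same warp take different paths,
// so the hardware runs both paths back to back
__global__ void diverges(float *out) {
    int i = threadIdx.x;
    if (i % 2 == 0) out[i] = 1.0f;
    else            out[i] = 2.0f;
}
// better: the branch is uniform within each 32-thread warp,
// so every warp executes only one of the two paths
__global__ void no_divergence(float *out) {
    int i = threadIdx.x;
    if (i / 32 == 0) out[i] = 1.0f;
    else             out[i] = 2.0f;
}
// loops behave the same way: if the trip count depends on threadIdx.x,
// the warp keeps looping until its slowest thread finishes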