Transcript of gmichi/asocd/exercises/ex_04.pdf (PowerPoint presentation, 43 pages)

Page 1

Page 2

Page 3

Page 4

$ cd ex

$ cd 1.Enumerate_GPUs

$ make

$ ./enum_gpu.x

Enter the application folder

Compile the source code

Launch the application

Page 5

--- General Information for device 0 ---
Name: xxxx
Compute capability: xxxx
Clock rate: xxxx
Device copy overlap: xxxx
Kernel execution timeout: xxxx

--- Memory Information for device 0 ---
Total global mem: xxxx
Total constant Mem: xxxx
Max mem pitch: xxxx
Texture Alignment: xxxx

--- MP Information for device 0 ---
Multiprocessor count: xxxx
Shared mem per mp: xxxx
Registers per mp: xxxx
Threads in warp: xxxx
Max threads per block: xxxx
Max thread dimensions: (xxxx, xxxx, xxxx)
Max grid dimensions: (xxxx, xxxx, xxxx)
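The xxxx placeholders are filled in by the compiled program. As a rough sketch of how output like this could be produced with the CUDA runtime API (the field selection here is ours; the exercise's actual enum_gpu.cu source is not shown on the slides):

#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    int count = 0;
    cudaGetDeviceCount(&count);
    for (int dev = 0; dev < count; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        printf("--- General Information for device %d ---\n", dev);
        printf("Name: %s\n", prop.name);
        printf("Compute capability: %d.%d\n", prop.major, prop.minor);
        printf("Clock rate: %d kHz\n", prop.clockRate);
        printf("--- Memory Information for device %d ---\n", dev);
        printf("Total global mem: %zu\n", prop.totalGlobalMem);
        printf("Total constant mem: %zu\n", prop.totalConstMem);
        printf("--- MP Information for device %d ---\n", dev);
        printf("Multiprocessor count: %d\n", prop.multiProcessorCount);
        printf("Threads in warp: %d\n", prop.warpSize);
        printf("Max threads per block: %d\n", prop.maxThreadsPerBlock);
    }
    return 0;
}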

Page 6

Page 7

Page 8

$ cd ..

$ cd 2.Hello_World

$ make

Page 9

#include <stdio.h>

int main(void) {
    printf("Hello World!\n");
    return 0;
}

$ ./hello_world.x

Page 10


#include <stdio.h>

__global__ void kernel(void) {}

int main(void) {
    kernel<<<1,1>>>();
    printf("Hello World!\n");
    return 0;
}

$ ./simple_kernel.x
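Note that kernel<<<1,1>>>() is not standard C, so this file must be compiled with the NVIDIA compiler; the exercise's Makefile presumably runs something along these lines (file name and flags are an assumption):

$ nvcc -o simple_kernel.x simple_kernel.cu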

Page 11

Page 12

__global__ void add(int *a, int *b, int *c) {
    *c = *a + *b;
}

Page 13

Page 14

int main(void) {
    int a, b, c;              // host copies of a, b, c
    int *d_a, *d_b, *d_c;     // device copies of a, b, c
    int size = sizeof(int);

    // Allocate space for device copies of a, b, c
    cudaMalloc((void **)&d_a, size);
    cudaMalloc((void **)&d_b, size);
    cudaMalloc((void **)&d_c, size);

    // Setup input values
    a = 2;
    b = 7;

Page 15

    // Copy inputs to device
    cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU
    add<<<1,1>>>(d_a, d_b, d_c);

    // Copy result back to host
    cudaMemcpy(&c, d_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    cudaFree(d_a); cudaFree(d_b); cudaFree(d_c);
    return 0;
}
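The slide's program never prints the result. To check it, one could add a print between the copy-back and the cleanup, for example (our addition, not on the slide):

    printf("%d + %d = %d\n", a, b, c);   // expected output: 2 + 7 = 9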

Page 16

add<<< 1, 1 >>>();

add<<< N, 1 >>>();
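The only difference between the two launches is the first parameter inside the <<< >>>: the number of thread blocks in the grid. A tiny sketch to make the block index visible (the kernel name is ours, not from the slides; device-side printf requires compute capability 2.0 or later):

#include <stdio.h>

__global__ void which_block(void) {
    printf("Hello from block %d of %d\n", blockIdx.x, gridDim.x);
}

// which_block<<<1, 1>>>();   // one block  -> one line of output
// which_block<<<4, 1>>>();   // four blocks -> four lines, in no guaranteed order
// cudaDeviceSynchronize();   // wait for the kernel and flush device-side printf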

Page 17

__global__ void add(int *a, int *b, int *c) {
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

Page 18

__global__ void add(int *a, int *b, int *c) {
    c[blockIdx.x] = a[blockIdx.x] + b[blockIdx.x];
}

Page 19

#define N 512

int main(void) {
    int *a, *b, *c;                 // host copies of a, b, c
    int *dev_a, *dev_b, *dev_c;     // device copies of a, b, c
    int size = N * sizeof(int);
    …

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&dev_a, size);
    cudaMalloc((void **)&dev_b, size);
    cudaMalloc((void **)&dev_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);
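random_ints() is a helper provided with the exercise sources; its implementation is not shown on the slides. A minimal sketch of what it presumably does (name and signature taken from the calls above, the body is our assumption):

#include <stdlib.h>

void random_ints(int *p, int n) {
    // fill p[0..n-1] with pseudo-random integers
    for (int i = 0; i < n; i++)
        p[i] = rand();
}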

Page 20

    // Copy inputs to device
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU with N blocks
    add<<<N,1>>>(dev_a, dev_b, dev_c);

    // Copy result back to host
    cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    return 0;
}

Page 21


Page 22

__global__ void add(int *a, int *b, int *c) {
    c[threadIdx.x] = a[threadIdx.x] + b[threadIdx.x];
}

Page 23

#define N 512

int main(void) {
    int *a, *b, *c;                 // host copies of a, b, c
    int *dev_a, *dev_b, *dev_c;     // device copies of a, b, c
    int size = N * sizeof(int);
    …

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&dev_a, size);
    cudaMalloc((void **)&dev_b, size);
    cudaMalloc((void **)&dev_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);

Page 24

    // Copy inputs to device
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU with N threads
    add<<<1,N>>>(dev_a, dev_b, dev_c);

    // Copy result back to host
    cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    return 0;
}

Page 25

Page 26

int index = threadIdx.x + blockIdx.x * M;

Page 27

With M = 8 threads per block, thread 5 of block 2 computes:

int index = threadIdx.x + blockIdx.x * M
          = 5 + 2 * 8
          = 21;

Page 28

int index = threadIdx.x + blockIdx.x * blockDim.x;

__global__ void add(int *a, int *b, int *c) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    c[index] = a[index] + b[index];
}

Page 29

#define N 512
#define THREADS_PER_BLOCK 512   // assumed value; used in the launch below but not shown on the slide

int main(void) {
    int *a, *b, *c;                 // host copies of a, b, c
    int *dev_a, *dev_b, *dev_c;     // device copies of a, b, c
    int size = N * sizeof(int);
    …

    // Alloc space for device copies of a, b, c
    cudaMalloc((void **)&dev_a, size);
    cudaMalloc((void **)&dev_b, size);
    cudaMalloc((void **)&dev_c, size);

    // Alloc space for host copies of a, b, c and setup input values
    a = (int *)malloc(size); random_ints(a, N);
    b = (int *)malloc(size); random_ints(b, N);
    c = (int *)malloc(size);

Page 30

    // Copy inputs to device
    cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);

    // Launch add() kernel on GPU
    add<<<N/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(dev_a, dev_b, dev_c);

    // Copy result back to host
    cudaMemcpy(c, dev_c, size, cudaMemcpyDeviceToHost);

    // Cleanup
    free(a); free(b); free(c);
    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    return 0;
}

Page 31

__global__ void add(int *a, int *b, int *c, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < n)
        c[index] = a[index] + b[index];
}

add<<<(N + M - 1) / M, M>>>(d_a, d_b, d_c, N);
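The (N + M - 1) / M grid size rounds up so that N does not have to be a multiple of the block size M; the (index < n) test in the kernel masks off the excess threads. For example, with N = 1000 and M = 256 (illustrative values, not from the slide):

int blocks = (1000 + 256 - 1) / 256;   // = 1255 / 256 = 4 blocks
// 4 * 256 = 1024 threads are launched; the last 24 fail the (index < n) test and do nothing.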

Page 32

Page 33

[Figure: 1D stencil; each output element sums the input elements within RADIUS positions on either side.]

Page 34

Page 35

• __shared__ declares data in fast on-chip shared memory, visible to all threads of the same block

Page 36

[Figure: each block computes blockDim.x output elements and additionally loads a halo of RADIUS input elements on the left and on the right into shared memory.]

Page 37

__global__ void stencil_1d(int *in, int *out) {
    __shared__ int temp[/* WHAT SIZE? */];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

Question: find the right vector size and complete the code.

Page 38

    // Apply the stencil
    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
        result += temp[lindex + offset];

    // Store the result
    out[gindex - RADIUS] = result;
}

Page 39

The stencil example will not work…

Suppose thread 15 reads the halo before thread 0 has fetched it…

temp[lindex] = in[gindex];                          // Store at temp[18]

if (threadIdx.x < RADIUS) {                         // Skipped: threadIdx.x > RADIUS
    temp[lindex - RADIUS] = in[gindex - RADIUS];
    temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
}

int result = 0;
result += temp[lindex + 1];                         // Load from temp[19]

Page 40

• void __syncthreads(); synchronizes all threads within a block: every thread waits at the barrier until all threads of the block have reached it

• Calling __syncthreads() between the shared-memory loads and the stencil loop removes the race shown on the previous page
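Putting this together, a corrected stencil kernel places the barrier between the shared-memory loads and the stencil loop. The following sketch also fills in the size asked for on page 37 (BLOCK_SIZE + 2*RADIUS elements), assuming, as in the exercise, that the kernel is launched with blockDim.x == BLOCK_SIZE; the rest of the indexing is kept as on the slides:

__global__ void stencil_1d(int *in, int *out) {
    __shared__ int temp[BLOCK_SIZE + 2 * RADIUS];
    int gindex = threadIdx.x + blockIdx.x * blockDim.x;
    int lindex = threadIdx.x + RADIUS;

    // Read input elements into shared memory
    temp[lindex] = in[gindex];
    if (threadIdx.x < RADIUS) {
        temp[lindex - RADIUS] = in[gindex - RADIUS];
        temp[lindex + BLOCK_SIZE] = in[gindex + BLOCK_SIZE];
    }

    // Barrier: no thread applies the stencil until every thread in the
    // block has finished writing its part of temp[]
    __syncthreads();

    // Apply the stencil
    int result = 0;
    for (int offset = -RADIUS; offset <= RADIUS; offset++)
        result += temp[lindex + offset];

    // Store the result
    out[gindex - RADIUS] = result;
}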

Page 41

__global__ void matmatmulgpu(double *a, double *b, double *c, int lda) {
    unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // assign one thread per output - row major over whole matrix
    int row = /* WHAT INDEX? */ ;
    int col = /* WHAT INDEX? */ ;
    double sum = 0.0;

    for (int k = 0; k < lda; k++) {
        sum += a[row*lda+k] * b[k*lda+col];
    }
    c[tid] = sum;
    return;
}
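Host-side, the kernel expects one thread per element of the lda-by-lda output matrix, addressed linearly by tid. A hedged launch sketch (the block size, the assumption that lda*lda is a multiple of it, and the device pointers d_a, d_b, d_c allocated with cudaMalloc are ours, not from the exercise):

int threads = 256;                      // assumed block size
int blocks  = (lda * lda) / threads;    // assumes lda*lda is a multiple of threads (the kernel has no bounds check)
matmatmulgpu<<<blocks, threads>>>(d_a, d_b, d_c, lda);
cudaDeviceSynchronize();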

Page 42

// This kernel is optimized to ensure all global reads and writes are coalesced,
// and to avoid bank conflicts in shared memory. This kernel is up to 11x faster
// than the naive kernel below. Note that the shared memory array is sized to
// (BLOCK_DIM+1)*BLOCK_DIM. This pads each row of the 2D block in shared memory
// so that bank conflicts do not occur when threads address the array column-wise.
__global__ void transpose(float *odata, float *idata, int width, int height)
{
    __shared__ float block[BLOCK_DIM][BLOCK_DIM+1];

    // read the matrix tile into shared memory
    // load one element per thread from device memory (idata) and store it
    // in transposed order in block[][]
    unsigned int xIndex = /* COMPLETE */ ;
    unsigned int yIndex = /* COMPLETE */ ;
    if ((xIndex < width) && (yIndex < height)) {
        unsigned int index_in = /* COMPLETE */ ;
        block[threadIdx.y][threadIdx.x] = /* COMPLETE */ ;
    }

    // synchronise to ensure all writes to block[][] have completed
    /* COMPLETE */ ;

    // write the transposed matrix tile to global memory (odata) in linear order
    xIndex = /* COMPLETE */ ;
    yIndex = /* COMPLETE */ ;
    if ((xIndex < height) && (yIndex < width)) {
        unsigned int index_out = /* COMPLETE */ ;
        odata[index_out] = /* COMPLETE */ ;
    }
}
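Since the kernel works on BLOCK_DIM-by-BLOCK_DIM tiles, a natural launch uses 2D blocks of that shape and enough blocks to cover the matrix. A hedged host-side sketch (the BLOCK_DIM value, the round-up, and the device pointer names d_odata, d_idata are assumptions, not from the exercise):

dim3 threads(BLOCK_DIM, BLOCK_DIM);                  // e.g. BLOCK_DIM = 16
dim3 grid((width  + BLOCK_DIM - 1) / BLOCK_DIM,
          (height + BLOCK_DIM - 1) / BLOCK_DIM);
transpose<<<grid, threads>>>(d_odata, d_idata, width, height);
cudaDeviceSynchronize();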

Page 43