To successfully complete this assignment you need to participate both individually and in groups during class. If you attend class in person, have one of the instructors check your notebook and sign you out before leaving class on Wednesday, March 10. If you are attending asynchronously, turn in your assignment using D2L no later than 11:59 p.m. on Wednesday, March 10.
Image from: https://www.appianimosaic.com/
%%writefile NCode/tiled_transpose.cu
#include <iostream>
#include <cuda.h>
#include <chrono>
#define CUDA_CALL(x) {cudaError_t cuda_error__ = (x); if (cuda_error__) { fprintf(stderr, "CUDA error: " #x " returned \"%s\"\n", cudaGetErrorString(cuda_error__)); fflush(stderr); exit(cuda_error__); } }
using namespace std;
const int BLOCKDIM = 32;
// Naive transpose: the read from in_d is coalesced, but the write to out_d is strided.
// Note: the index arithmetic mixes the row and col strides, so this kernel is only
// correct for square matrices (row == col), which is what main() uses.
__global__ void transpose(const double *in_d, double * out_d, int row, int col)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < col && y < row)
        out_d[y+col*x] = in_d[x+row*y];
}
// Tiled transpose: stage a BLOCKDIM x BLOCKDIM tile in shared memory so that both
// the global read and the global write can be coalesced.
// Assumes row and col are multiples of BLOCKDIM (true in main()); otherwise the
// __syncthreads() calls inside the if-block would be reached by only part of the block.
__global__ void tiled_transpose(const double *in_d, double * out_d, int row, int col)
{
    int x = blockIdx.x * BLOCKDIM + threadIdx.x;   // column in the input
    int y = blockIdx.y * BLOCKDIM + threadIdx.y;   // row in the input
    int x2 = blockIdx.y * BLOCKDIM + threadIdx.x;  // column in the output (transposed block)
    int y2 = blockIdx.x * BLOCKDIM + threadIdx.y;  // row in the output (transposed block)
    __shared__ double in_local[BLOCKDIM][BLOCKDIM];
    __shared__ double out_local[BLOCKDIM][BLOCKDIM];
    if (x < col && y < row) {
        in_local[threadIdx.x][threadIdx.y] = in_d[x+row*y];
        __syncthreads();
        out_local[threadIdx.y][threadIdx.x] = in_local[threadIdx.x][threadIdx.y];
        __syncthreads();
        out_d[x2+col*y2] = out_local[threadIdx.x][threadIdx.y];
    }
}
// In-place transpose for a square matrix: each thread below the diagonal (x < y)
// swaps its element with the mirrored element across the diagonal.
// The result is left in in_d; out_d is not used by this kernel.
__global__ void transpose_symmetric(double *in_d, double * out_d, int row, int col)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < col && y < row) {
        if (x < y) {
            double temp = in_d[y+col*x];
            in_d[y+col*x] = in_d[x+row*y];
            in_d[x+row*y] = temp;
        }
    }
}
int main(int argc,char **argv)
{
    std::cout << "Begin\n";
    int sz_x=BLOCKDIM*300;
    int sz_y=BLOCKDIM*300;
    int nBytes = sz_x*sz_y*sizeof(double);
    int block_size = BLOCKDIM;
    double *m_h = (double *)malloc(nBytes);
    double * in_d;
    double * out_d;
    int count = 0;
    // Fill the host matrix with 0,1,2,... so a double transpose is easy to verify.
    for (int i=0; i < sz_x*sz_y; i++){
        m_h[i] = count;
        count++;
    }
    std::cout << "Allocating device memory...\n";
    CUDA_CALL(cudaMalloc((void **)&in_d,nBytes));
    CUDA_CALL(cudaMalloc((void **)&out_d,nBytes));
    //Set up blocks
    dim3 dimBlock(block_size,block_size,1);
    dim3 dimGrid(sz_x/block_size,sz_y/block_size,1);
    std::cout << "Doing GPU Transpose\n";
    CUDA_CALL(cudaMemcpy(in_d,m_h,nBytes,cudaMemcpyHostToDevice));
    auto start_d = std::chrono::high_resolution_clock::now();
    /**********************
    //transpose<<<dimGrid,dimBlock>>>(in_d,out_d,sz_y,sz_x);
    tiled_transpose<<<dimGrid,dimBlock>>>(in_d,out_d,sz_y,sz_x);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "\n\nError: %s\n\n", cudaGetErrorString(err)); fflush(stderr); exit(err);
    }
    CUDA_CALL(cudaMemcpy(m_h,out_d,nBytes,cudaMemcpyDeviceToHost));
    ************************/
    /**********************/
    // In-place version: the result comes back in in_d rather than out_d.
    transpose_symmetric<<<dimGrid,dimBlock>>>(in_d,out_d,sz_y,sz_x);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "\n\nError: %s\n\n", cudaGetErrorString(err)); fflush(stderr); exit(err);
    }
    // The blocking copy also synchronizes, so the device timing includes the kernel and the copy.
    CUDA_CALL(cudaMemcpy(m_h,in_d,nBytes,cudaMemcpyDeviceToHost));
    /************************/
    auto end_d = std::chrono::high_resolution_clock::now();
    std::cout << "Doing CPU Transpose\n";
    auto start_h = std::chrono::high_resolution_clock::now();
    for (int y=0; y < sz_y; y++){
        for (int x=y; x < sz_x; x++){
            double temp = m_h[x+sz_x*y];
            //std::cout << temp << " ";
            m_h[x+sz_x*y] = m_h[y+sz_y*x];
            m_h[y+sz_y*x] = temp;
        }
        //std::cout << "\n";
    }
    auto end_h = std::chrono::high_resolution_clock::now();
    //Checking errors (should be same values as start)
    count = 0;
    int errors = 0;
    for (int i=0; i < sz_x*sz_y; i++){
        if (m_h[i] != count)
            errors++;
        count++;
    }
    std::cout << errors << " Errors found in transpose\n";
    //Print Timing
    std::chrono::duration<double> time_d = end_d - start_d;
    std::cout << "Device time: " << time_d.count() << " s\n";
    std::chrono::duration<double> time_h = end_h - start_h;
    std::cout << "Host time: " << time_h.count() << " s\n";
    free(m_h);
    cudaFree(in_d);
    cudaFree(out_d);
    return 0;
}
Overwriting NCode/tiled_transpose.cu
#Compile Cuda
!nvcc -std=c++11 -o tiled_transpose NCode/tiled_transpose.cu
nvcc: Command not found.
#Run Example
!./tiled_transpose
./tiled_transpose: Command not found.
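A common refinement of the tiled kernel above, offered here only as a rough sketch rather than as part of the assignment code, is to use a single shared-memory tile padded by one extra column so that reading the tile column-wise does not funnel a warp into a handful of shared-memory banks. The kernel name tiled_transpose_padded is my own, and the index arithmetic is written for a general row-major matrix with row rows and col columns, which reduces to the square case used in this notebook; it would be launched with the same dimGrid and dimBlock as the other kernels.

// Sketch only: single padded tile instead of the two tiles used above.
__global__ void tiled_transpose_padded(const double *in_d, double *out_d, int row, int col)
{
    __shared__ double tile[BLOCKDIM][BLOCKDIM + 1];   // extra column staggers shared-memory banks

    int x  = blockIdx.x * BLOCKDIM + threadIdx.x;     // column in the input
    int y  = blockIdx.y * BLOCKDIM + threadIdx.y;     // row in the input
    int x2 = blockIdx.y * BLOCKDIM + threadIdx.x;     // column in the output
    int y2 = blockIdx.x * BLOCKDIM + threadIdx.y;     // row in the output

    if (x < col && y < row)
        tile[threadIdx.y][threadIdx.x] = in_d[x + col*y];    // coalesced read of the input tile
    __syncthreads();                                         // every thread reaches this barrier
    if (x2 < row && y2 < col)
        out_d[x2 + row*y2] = tile[threadIdx.x][threadIdx.y]; // coalesced write of the transposed tile
}

Timing this variant against tiled_transpose and the naive transpose would be a natural in-class comparison.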
As a group, let's see if we can optimize the code from last time.
%%writefile wave_cuda.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#define CUDA_CALL(x) {cudaError_t cuda_error__ = (x); if (cuda_error__) printf("CUDA error: " #x " returned \"%s\"\n", cudaGetErrorString(cuda_error__));}
// Acceleration update: second spatial difference dvdt[i] = (y[i+1]+y[i-1]-2*y[i])/dx^2,
// with the two end points held fixed (dvdt = 0).
__global__ void accel_update(double* d_dvdt, double* d_y, int nx, double dx2inv)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i > 0 && i < nx-1)
        d_dvdt[i]=(d_y[i+1]+d_y[i-1]-2.0*d_y[i])*(dx2inv);
    else if (i < nx)   // bounds check: the grid is rounded up, so some threads have i >= nx
        d_dvdt[i] = 0;
}
// Update velocity from acceleration, then position from the new velocity.
// nx is passed in so out-of-range threads do nothing.
__global__ void pos_update(double * d_dvdt, double * d_y, double * d_v, double dt, int nx)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nx) {
        d_v[i] = d_v[i] + dt*d_dvdt[i];
        d_y[i] = d_y[i] + dt*d_v[i];
    }
}
int main(int argc, char ** argv) {
    int nx = 5000;
    int nt = 1000000;
    int i,it;
    double x[nx];
    double y[nx];
    double v[nx];
    double dvdt[nx];
    double dt;
    double dx;
    double max,min;
    double dx2inv;
    double tmax;
    double *d_x, *d_y, *d_v, *d_dvdt;
    CUDA_CALL(cudaMalloc((void **)&d_x,nx*sizeof(double)));
    CUDA_CALL(cudaMalloc((void **)&d_y,nx*sizeof(double)));
    CUDA_CALL(cudaMalloc((void **)&d_v,nx*sizeof(double)));
    CUDA_CALL(cudaMalloc((void **)&d_dvdt,nx*sizeof(double)));
    max=10.0;
    min=0.0;
    dx = (max-min)/(double)(nx-1);
    x[0] = min;
    for(i=1;i<nx-1;i++) {
        x[i] = min+(double)i*dx;
    }
    x[nx-1] = max;
    tmax=10.0;
    dt= (tmax-0.0)/(double)(nt-1);
    for (i=0;i<nx;i++) {
        y[i] = exp(-(x[i]-5.0)*(x[i]-5.0));
        v[i] = 0.0;
        dvdt[i] = 0.0;
    }
    CUDA_CALL(cudaMemcpy(d_x,x,nx*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(d_y,y,nx*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(d_v,v,nx*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(d_dvdt,dvdt,nx*sizeof(double),cudaMemcpyHostToDevice));
    dx2inv=1.0/(dx*dx);
    int block_size=1024;
    int block_no = (nx+block_size-1)/block_size;  // round up: nx/block_size would leave points 4096-4999 without a thread
    dim3 dimBlock(block_size,1,1);
    dim3 dimGrid(block_no,1,1);
    for(it=0;it<nt-1;it++) {
        accel_update<<<dimGrid, dimBlock>>>(d_dvdt, d_y, nx, dx2inv);
        pos_update<<<dimGrid, dimBlock>>>(d_dvdt, d_y, d_v, dt, nx);
    }
    CUDA_CALL(cudaMemcpy(x,d_x,nx*sizeof(double),cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(y,d_y,nx*sizeof(double),cudaMemcpyDeviceToHost));
    for(i=nx/2-10; i<nx/2+10; i++) {
        printf("%g %g\n",x[i],y[i]);
    }
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_v);
    cudaFree(d_dvdt);
    return 0;
}
Overwriting wave_cuda.cu
!nvcc -std=c++11 -o wave_cuda wave_cuda.cu
nvcc: Command not found.
%%time
!./wave_cuda
./wave_cuda: Command not found.
CPU times: user 1.74 ms, sys: 3.92 ms, total: 5.66 ms
Wall time: 150 ms
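One optimization the group might explore, sketched below and not part of the original notebook, is to cut redundant global reads in accel_update: each interior d_y value is currently read by three different threads, so a block could instead stage its slice of d_y plus one halo point on each side in shared memory. The kernel name accel_update_shared, the const qualifier, and the dynamic shared-memory layout are my additions.

// Sketch only: shared-memory version of accel_update. Each block loads blockDim.x
// points of d_y plus one halo point on each side, so each d_y value is read from
// global memory once per block instead of up to three times.
__global__ void accel_update_shared(double* d_dvdt, const double* d_y, int nx, double dx2inv)
{
    extern __shared__ double y_s[];                  // blockDim.x + 2 entries
    int i  = blockDim.x * blockIdx.x + threadIdx.x;  // global grid point
    int li = threadIdx.x + 1;                        // local index, shifted past the left halo

    if (i < nx)
        y_s[li] = d_y[i];
    if (threadIdx.x == 0 && i > 0)
        y_s[0] = d_y[i-1];                           // left halo
    if (threadIdx.x == blockDim.x-1 && i < nx-1)
        y_s[li+1] = d_y[i+1];                        // right halo
    __syncthreads();                                 // tile is complete for the whole block

    if (i > 0 && i < nx-1)
        d_dvdt[i] = (y_s[li+1] + y_s[li-1] - 2.0*y_s[li]) * dx2inv;
    else if (i < nx)
        d_dvdt[i] = 0;                               // fixed ends
}

It would be launched with a third configuration argument for the dynamic shared memory, for example accel_update_shared<<<dimGrid, dimBlock, (block_size+2)*sizeof(double)>>>(d_dvdt, d_y, nx, dx2inv); whether this actually beats the simple kernel for a problem as small as nx = 5000 is worth measuring in class.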
If you attend class in person, have one of the instructors check your notebook and sign you out before leaving class. If you are attending asynchronously, turn in your assignment using D2L.
Written by Dr. Dirk Colbry, Michigan State University
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.