To successfully complete this assignment you need to participate both individually and in groups during class. If you attend class in person, have one of the instructors check your notebook and sign you out before leaving class on Wednesday, March 10. If you are attending asynchronously, turn in your assignment using D2L no later than 11:59 p.m. on Wednesday, March 10.
Image from: https://www.appianimosaic.com/
%%writefile NCode/tiled_transpose.cu
#include <iostream>
#include <cuda.h>
#include <chrono>
#define CUDA_CALL(x) {cudaError_t cuda_error__ = (x); if (cuda_error__) { fprintf(stderr, "CUDA error: " #x " returned \"%s\"\n", cudaGetErrorString(cuda_error__)); fflush(stderr); exit(cuda_error__); } }
using namespace std;
const int BLOCKDIM = 32;
// Naive transpose: the read from in_d is coalesced, but the write to out_d is strided.
// Note: the index arithmetic mixes the row and col strides, so this kernel is only
// correct for square matrices (row == col), which is what main() uses.
__global__ void transpose(const double *in_d, double * out_d, int row, int col)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < col && y < row)
        out_d[y+col*x] = in_d[x+row*y];
}
// Tiled transpose: stage a BLOCKDIM x BLOCKDIM tile in shared memory so that both
// the global read and the global write can be coalesced.
// Assumes row and col are multiples of BLOCKDIM (true in main()); otherwise the
// __syncthreads() calls inside the if-block would be reached by only part of the block.
__global__ void tiled_transpose(const double *in_d, double * out_d, int row, int col)
{
    int x = blockIdx.x * BLOCKDIM + threadIdx.x;   // column in the input
    int y = blockIdx.y * BLOCKDIM + threadIdx.y;   // row in the input
    int x2 = blockIdx.y * BLOCKDIM + threadIdx.x;  // column in the output (transposed block)
    int y2 = blockIdx.x * BLOCKDIM + threadIdx.y;  // row in the output (transposed block)
    __shared__ double in_local[BLOCKDIM][BLOCKDIM];
    __shared__ double out_local[BLOCKDIM][BLOCKDIM];
    if (x < col && y < row) {
        in_local[threadIdx.x][threadIdx.y] = in_d[x+row*y];
        __syncthreads();
        out_local[threadIdx.y][threadIdx.x] = in_local[threadIdx.x][threadIdx.y];
        __syncthreads();
        out_d[x2+col*y2] = out_local[threadIdx.x][threadIdx.y];
    }
}
// In-place transpose for a square matrix: each thread below the diagonal (x < y)
// swaps its element with the mirrored element across the diagonal.
// The result is left in in_d; out_d is not used by this kernel.
__global__ void transpose_symmetric(double *in_d, double * out_d, int row, int col)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < col && y < row) {
        if (x < y) {
            double temp = in_d[y+col*x];
            in_d[y+col*x] = in_d[x+row*y];
            in_d[x+row*y] = temp;
        }
    }
}
int main(int argc,char **argv)
{
    std::cout << "Begin\n";
    int sz_x=BLOCKDIM*300;
    int sz_y=BLOCKDIM*300;
    int nBytes = sz_x*sz_y*sizeof(double);
    int block_size = BLOCKDIM;
    double *m_h = (double *)malloc(nBytes);
    double * in_d;
    double * out_d;
    int count = 0;
    // Fill the host matrix with 0,1,2,... so a double transpose is easy to verify.
    for (int i=0; i < sz_x*sz_y; i++){
        m_h[i] = count;
        count++;
    }
    std::cout << "Allocating device memory...\n";
    CUDA_CALL(cudaMalloc((void **)&in_d,nBytes));
    CUDA_CALL(cudaMalloc((void **)&out_d,nBytes));
    //Set up blocks
    dim3 dimBlock(block_size,block_size,1);
    dim3 dimGrid(sz_x/block_size,sz_y/block_size,1);
    std::cout << "Doing GPU Transpose\n";
    CUDA_CALL(cudaMemcpy(in_d,m_h,nBytes,cudaMemcpyHostToDevice));
    auto start_d = std::chrono::high_resolution_clock::now();
    /**********************
    //transpose<<<dimGrid,dimBlock>>>(in_d,out_d,sz_y,sz_x);
    tiled_transpose<<<dimGrid,dimBlock>>>(in_d,out_d,sz_y,sz_x);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "\n\nError: %s\n\n", cudaGetErrorString(err)); fflush(stderr); exit(err);
    }
    CUDA_CALL(cudaMemcpy(m_h,out_d,nBytes,cudaMemcpyDeviceToHost));
    ************************/
    /**********************/
    // In-place version: the result comes back in in_d rather than out_d.
    transpose_symmetric<<<dimGrid,dimBlock>>>(in_d,out_d,sz_y,sz_x);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "\n\nError: %s\n\n", cudaGetErrorString(err)); fflush(stderr); exit(err);
    }
    // The blocking copy also synchronizes, so the device timing includes the kernel and the copy.
    CUDA_CALL(cudaMemcpy(m_h,in_d,nBytes,cudaMemcpyDeviceToHost));
    /************************/
    auto end_d = std::chrono::high_resolution_clock::now();
    std::cout << "Doing CPU Transpose\n";
    auto start_h = std::chrono::high_resolution_clock::now();
    for (int y=0; y < sz_y; y++){
        for (int x=y; x < sz_x; x++){
            double temp = m_h[x+sz_x*y];
            //std::cout << temp << " ";
            m_h[x+sz_x*y] = m_h[y+sz_y*x];
            m_h[y+sz_y*x] = temp;
        }
        //std::cout << "\n";
    }
    auto end_h = std::chrono::high_resolution_clock::now();
    //Checking errors (should be same values as start)
    count = 0;
    int errors = 0;
    for (int i=0; i < sz_x*sz_y; i++){
        if (m_h[i] != count)
            errors++;
        count++;
    }
    std::cout << errors << " Errors found in transpose\n";
    //Print Timing
    std::chrono::duration<double> time_d = end_d - start_d;
    std::cout << "Device time: " << time_d.count() << " s\n";
    std::chrono::duration<double> time_h = end_h - start_h;
    std::cout << "Host time: " << time_h.count() << " s\n";
    free(m_h);
    cudaFree(in_d);
    cudaFree(out_d);
    return 0;
}
Overwriting NCode/tiled_transpose.cu
#Compile Cuda
!nvcc -std=c++11 -o tiled_transpose NCode/tiled_transpose.cu
nvcc: Command not found.
#Run Example
!./tiled_transpose
./tiled_transpose: Command not found.
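A common refinement of the tiled kernel above, offered here only as a rough sketch rather than as part of the assignment code, is to use a single shared-memory tile padded by one extra column so that reading the tile column-wise does not funnel a warp into a handful of shared-memory banks. The kernel name tiled_transpose_padded is my own, and the index arithmetic is written for a general row-major matrix with row rows and col columns, which reduces to the square case used in this notebook; it would be launched with the same dimGrid and dimBlock as the other kernels.

// Sketch only: single padded tile instead of the two tiles used above.
__global__ void tiled_transpose_padded(const double *in_d, double *out_d, int row, int col)
{
    __shared__ double tile[BLOCKDIM][BLOCKDIM + 1];   // extra column staggers shared-memory banks

    int x  = blockIdx.x * BLOCKDIM + threadIdx.x;     // column in the input
    int y  = blockIdx.y * BLOCKDIM + threadIdx.y;     // row in the input
    int x2 = blockIdx.y * BLOCKDIM + threadIdx.x;     // column in the output
    int y2 = blockIdx.x * BLOCKDIM + threadIdx.y;     // row in the output

    if (x < col && y < row)
        tile[threadIdx.y][threadIdx.x] = in_d[x + col*y];    // coalesced read of the input tile
    __syncthreads();                                         // every thread reaches this barrier
    if (x2 < row && y2 < col)
        out_d[x2 + row*y2] = tile[threadIdx.x][threadIdx.y]; // coalesced write of the transposed tile
}

Timing this variant against tiled_transpose and the naive transpose would be a natural in-class comparison.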
As a group, let's see if we can optimize the code from last time.
%%writefile wave_cuda.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#define CUDA_CALL(x) {cudaError_t cuda_error__ = (x); if (cuda_error__) printf("CUDA error: " #x " returned \"%s\"\n", cudaGetErrorString(cuda_error__));}
// Acceleration update: second spatial difference dvdt[i] = (y[i+1]+y[i-1]-2*y[i])/dx^2,
// with the two end points held fixed (dvdt = 0).
__global__ void accel_update(double* d_dvdt, double* d_y, int nx, double dx2inv)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i > 0 && i < nx-1)
        d_dvdt[i]=(d_y[i+1]+d_y[i-1]-2.0*d_y[i])*(dx2inv);
    else if (i < nx)   // bounds check: the grid is rounded up, so some threads have i >= nx
        d_dvdt[i] = 0;
}
// Update velocity from acceleration, then position from the new velocity.
// nx is passed in so out-of-range threads do nothing.
__global__ void pos_update(double * d_dvdt, double * d_y, double * d_v, double dt, int nx)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < nx) {
        d_v[i] = d_v[i] + dt*d_dvdt[i];
        d_y[i] = d_y[i] + dt*d_v[i];
    }
}
int main(int argc, char ** argv) {
    int nx = 5000;
    int nt = 1000000;
    int i,it;
    double x[nx];
    double y[nx];
    double v[nx];
    double dvdt[nx];
    double dt;
    double dx;
    double max,min;
    double dx2inv;
    double tmax;
    double *d_x, *d_y, *d_v, *d_dvdt;
    CUDA_CALL(cudaMalloc((void **)&d_x,nx*sizeof(double)));
    CUDA_CALL(cudaMalloc((void **)&d_y,nx*sizeof(double)));
    CUDA_CALL(cudaMalloc((void **)&d_v,nx*sizeof(double)));
    CUDA_CALL(cudaMalloc((void **)&d_dvdt,nx*sizeof(double)));
    max=10.0;
    min=0.0;
    dx = (max-min)/(double)(nx-1);
    x[0] = min;
    for(i=1;i<nx-1;i++) {
        x[i] = min+(double)i*dx;
    }
    x[nx-1] = max;
    tmax=10.0;
    dt= (tmax-0.0)/(double)(nt-1);
    for (i=0;i<nx;i++) {
        y[i] = exp(-(x[i]-5.0)*(x[i]-5.0));
        v[i] = 0.0;
        dvdt[i] = 0.0;
    }
    CUDA_CALL(cudaMemcpy(d_x,x,nx*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(d_y,y,nx*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(d_v,v,nx*sizeof(double),cudaMemcpyHostToDevice));
    CUDA_CALL(cudaMemcpy(d_dvdt,dvdt,nx*sizeof(double),cudaMemcpyHostToDevice));
    dx2inv=1.0/(dx*dx);
    int block_size=1024;
    int block_no = (nx+block_size-1)/block_size;  // round up: nx/block_size would leave points 4096-4999 without a thread
    dim3 dimBlock(block_size,1,1);
    dim3 dimGrid(block_no,1,1);
    for(it=0;it<nt-1;it++) {
        accel_update<<<dimGrid, dimBlock>>>(d_dvdt, d_y, nx, dx2inv);
        pos_update<<<dimGrid, dimBlock>>>(d_dvdt, d_y, d_v, dt, nx);
    }
    CUDA_CALL(cudaMemcpy(x,d_x,nx*sizeof(double),cudaMemcpyDeviceToHost));
    CUDA_CALL(cudaMemcpy(y,d_y,nx*sizeof(double),cudaMemcpyDeviceToHost));
    for(i=nx/2-10; i<nx/2+10; i++) {
        printf("%g %g\n",x[i],y[i]);
    }
    cudaFree(d_x);
    cudaFree(d_y);
    cudaFree(d_v);
    cudaFree(d_dvdt);
    return 0;
}
Overwriting wave_cuda.cu
!nvcc -std=c++11 -o wave_cuda wave_cuda.cu
nvcc: Command not found.
%%time
!./wave_cuda
./wave_cuda: Command not found.
CPU times: user 1.74 ms, sys: 3.92 ms, total: 5.66 ms
Wall time: 150 ms
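One optimization the group might explore, sketched below and not part of the original notebook, is to cut redundant global reads in accel_update: each interior d_y value is currently read by three different threads, so a block could instead stage its slice of d_y plus one halo point on each side in shared memory. The kernel name accel_update_shared, the const qualifier, and the dynamic shared-memory layout are my additions.

// Sketch only: shared-memory version of accel_update. Each block loads blockDim.x
// points of d_y plus one halo point on each side, so each d_y value is read from
// global memory once per block instead of up to three times.
__global__ void accel_update_shared(double* d_dvdt, const double* d_y, int nx, double dx2inv)
{
    extern __shared__ double y_s[];                  // blockDim.x + 2 entries
    int i  = blockDim.x * blockIdx.x + threadIdx.x;  // global grid point
    int li = threadIdx.x + 1;                        // local index, shifted past the left halo

    if (i < nx)
        y_s[li] = d_y[i];
    if (threadIdx.x == 0 && i > 0)
        y_s[0] = d_y[i-1];                           // left halo
    if (threadIdx.x == blockDim.x-1 && i < nx-1)
        y_s[li+1] = d_y[i+1];                        // right halo
    __syncthreads();                                 // tile is complete for the whole block

    if (i > 0 && i < nx-1)
        d_dvdt[i] = (y_s[li+1] + y_s[li-1] - 2.0*y_s[li]) * dx2inv;
    else if (i < nx)
        d_dvdt[i] = 0;                               // fixed ends
}

It would be launched with a third configuration argument for the dynamic shared memory, for example accel_update_shared<<<dimGrid, dimBlock, (block_size+2)*sizeof(double)>>>(d_dvdt, d_y, nx, dx2inv); whether this actually beats the simple kernel for a problem as small as nx = 5000 is worth measuring in class.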
If you attend class in person, have one of the instructors check your notebook and sign you out before leaving class. If you are attending asynchronously, turn in your assignment using D2L.
Written by Dr. Dirk Colbry, Michigan State University
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.