To successfully complete this assignment, you need to participate both individually and in groups during class. Have one of the instructors check your notebook and sign you out before leaving class. Turn in your assignment using D2L.
ICA 22: CUDA Memory and Tiling#

Image from: https://www.appianimosaic.com/
Agenda for today’s class (70 minutes)#
(20 minutes) Pre-class Review
(20 minutes) Tiling example
(30 minutes) 1D Wave CUDA Code Optimization
1. Pre-class Review#
Discuss the pre-class assignment with your group and work through any confusion or challenges.
2. Tiling example#
In a tiled matrix multiply, each thread block cooperatively copies TILE_SIZE x TILE_SIZE sub-blocks of A and B into fast on-chip shared memory, accumulates the partial products from those tiles, and repeats across the row and column of tiles. Because each global-memory element is loaded once per tile instead of once per thread, the kernel issues far fewer slow global-memory reads. This video does a nice job of explaining the idea if the code below is not intuitive: https://www.youtube.com/watch?v=Q3GgbfGTnVc
%%writefile matrix_multiply.cu
#include <stdio.h>
#include <cuda.h>
#include <cmath>

#define N 4096        // Matrix size (N x N)
#define TILE_SIZE 32  // Tile size for shared memory optimization
#define EPSILON 1e-4  // Error tolerance for comparison

// Matrix multiplication without tiling: every thread reads its entire
// row of A and column of B directly from global memory.
__global__ void matMul(float *A, float *B, float *C, int n) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < n && col < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += A[row * n + k] * B[k * n + col];
        }
        C[row * n + col] = sum;
    }
}

// Matrix multiplication with tiling: each block stages TILE_SIZE x TILE_SIZE
// sub-blocks of A and B in shared memory so each global element is read once
// per tile instead of once per thread. (The loads assume n is a multiple of
// TILE_SIZE, which holds for N = 4096 and TILE_SIZE = 32.)
__global__ void matMulTiled(float *A, float *B, float *C, int n) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float sum = 0.0f;
    for (int i = 0; i < n / TILE_SIZE; i++) {
        // Cooperatively load one tile of A and one tile of B.
        tileA[threadIdx.y][threadIdx.x] = A[row * n + (i * TILE_SIZE + threadIdx.x)];
        tileB[threadIdx.y][threadIdx.x] = B[(i * TILE_SIZE + threadIdx.y) * n + col];
        __syncthreads();  // wait until the whole tile is loaded
        for (int k = 0; k < TILE_SIZE; k++) {
            sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
        }
        __syncthreads();  // wait before overwriting the tile next iteration
    }
    if (row < n && col < n)
        C[row * n + col] = sum;
}

void initializeMatrix(float *mat, int n) {
    for (int i = 0; i < n * n; i++)
        mat[i] = static_cast<float>(rand()) / RAND_MAX;
}

bool compareMatrices(float *A, float *B, int n, float epsilon) {
    for (int i = 0; i < n * n; i++) {
        if (fabs(A[i] - B[i]) > epsilon) {
            printf("Mismatch at index %d: A = %f, B = %f\n", i, A[i], B[i]);
            return false;
        }
    }
    return true;
}

// Time one kernel launch with CUDA events; returns elapsed milliseconds.
float timeKernel(void (*kernel)(float *, float *, float *, int), float *A, float *B, float *C, int n, dim3 gridDim, dim3 blockDim) {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    kernel<<<gridDim, blockDim>>>(A, B, C, n);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return milliseconds;
}

int main() {
    size_t bytes = N * N * sizeof(float);
    float *A, *B, *C, *C_tiled;
    float *d_A, *d_B, *d_C, *d_C_tiled;

    A = (float *)malloc(bytes);
    B = (float *)malloc(bytes);
    C = (float *)malloc(bytes);
    C_tiled = (float *)malloc(bytes);
    initializeMatrix(A, N);
    initializeMatrix(B, N);

    cudaMalloc(&d_A, bytes);
    cudaMalloc(&d_B, bytes);
    cudaMalloc(&d_C, bytes);
    cudaMalloc(&d_C_tiled, bytes);
    cudaMemcpy(d_A, A, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, bytes, cudaMemcpyHostToDevice);

    dim3 blockDim(TILE_SIZE, TILE_SIZE);
    dim3 gridDim(N / TILE_SIZE, N / TILE_SIZE);

    // Time each kernel. The tiled result goes in d_C_tiled and the untiled
    // result in d_C, so the buffer names match their contents.
    float tiledTime = timeKernel(matMulTiled, d_A, d_B, d_C_tiled, N, gridDim, blockDim);
    float baseTime = timeKernel(matMul, d_A, d_B, d_C, N, gridDim, blockDim);

    cudaMemcpy(C, d_C, bytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(C_tiled, d_C_tiled, bytes, cudaMemcpyDeviceToHost);

    printf("Matrix Multiplication Time: %.2f ms\n", baseTime);
    printf("Tiled Matrix Multiplication Time: %.2f ms\n", tiledTime);
    printf("Speedup: %.2fx\n", baseTime / tiledTime);

    // Compare the results
    if (compareMatrices(C, C_tiled, N, EPSILON)) {
        printf("SUCCESS: The matrices match!\n");
    } else {
        printf("ERROR: The matrices do not match!\n");
    }
    printf("%f\n", C[1]);
    printf("%f\n", C_tiled[1]);

    free(A); free(B); free(C); free(C_tiled);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaFree(d_C_tiled);
    return 0;
}
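To compile and run this from the notebook, something like the cell below should work. This is a sketch: the -O3 flag is optional, and depending on the GPU you are assigned you may also need an -arch flag (not shown here).
!nvcc -O3 matrix_multiply.cu -o matrix_multiply
!./matrix_multiply
The tiled kernel is usually faster because each block loads each TILE_SIZE x TILE_SIZE sub-block of A and B from global memory once and then reuses it TILE_SIZE times from shared memory; the untiled kernel issues many more global loads, though the hardware caches absorb some of them, so the measured speedup varies from GPU to GPU.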
3. 1D Wave CUDA Code Optimization#
See if you can update the wave equation code below to use tiling (shared memory). A sketch of one possible approach appears after the code.
%%writefile wave_cuda.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

// Second spatial derivative (the acceleration) at each interior point.
__global__ void update_dvdt(double* dvdt, double* y, double dx2inv, int nx) {
    int i = blockDim.x * blockIdx.x + threadIdx.x + 1;  // +1 skips the fixed left boundary
    if (i >= nx-1) return;                              // skip the fixed right boundary
    dvdt[i] = (y[i+1] + y[i-1] - 2.0*y[i])*(dx2inv);
}

// Forward-Euler update of velocity and displacement at each interior point.
__global__ void update_vy(double* v, double* y, double dt, double* dvdt, int nx) {
    int i = blockDim.x * blockIdx.x + threadIdx.x + 1;
    if (i >= nx-1) return;
    v[i] = v[i] + dt*dvdt[i];
    y[i] = y[i] + dt*v[i];
}

int main(int argc, char **argv) {
    int nx = 500;     // number of grid points
    int nt = 100000;  // number of time steps
    int i, it;
    double x[nx];
    double y[nx];
    double v[nx];
    double dvdt[nx];
    double dt;
    double dx;
    double max, min;
    double dx2inv;
    double tmax;
    int nxm1;

    // Device copies of displacement, velocity, and acceleration.
    double *c_y, *c_v, *c_dvdt;
    cudaMalloc((void**)&c_y, nx*sizeof(double));
    cudaMalloc((void**)&c_v, nx*sizeof(double));
    cudaMalloc((void**)&c_dvdt, nx*sizeof(double));

    // Spatial grid on [min, max].
    max = 10.0;
    min = 0.0;
    dx = (max-min)/(double)(nx);
    x[0] = min;
    for (i = 1; i < nx-1; i++) {
        x[i] = min + (double)i*dx;
    }
    x[nx-1] = max;

    tmax = 10.0;
    dt = (tmax-0.0)/(double)(nt);

    // Initial condition: a Gaussian pulse at rest, centered at x = 5.
    for (i = 0; i < nx; i++) {
        y[i] = exp(-(x[i]-5.0)*(x[i]-5.0));
        v[i] = 0.0;
        dvdt[i] = 0.0;
    }
    dx2inv = 1.0/(dx*dx);
    nxm1 = nx-1;

    cudaMemcpy(c_y, y, nx*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(c_v, v, nx*sizeof(double), cudaMemcpyHostToDevice);
    cudaMemcpy(c_dvdt, dvdt, nx*sizeof(double), cudaMemcpyHostToDevice);

    // One thread per grid point; a single block of 500 threads covers nx = 500.
    int threads = 500;
    int blocks = (nx + threads - 1)/threads;
    for (it = 0; it < nt-1; it++) {
        // Serial version replaced by the kernel:
        // for(i=1; i<nxm1; i++)
        //     dvdt[i] = (y[i+1]+y[i-1]-2.0*y[i])*(dx2inv);
        update_dvdt<<<blocks, threads>>>(c_dvdt, c_y, dx2inv, nx);
        // Serial version replaced by the kernel:
        // for(i=1; i<nxm1; i++) {
        //     v[i] = v[i] + dt*dvdt[i];
        //     y[i] = y[i] + dt*v[i];
        // }
        update_vy<<<blocks, threads>>>(c_v, c_y, dt, c_dvdt, nx);
    }
    cudaMemcpy(y, c_y, nx*sizeof(double), cudaMemcpyDeviceToHost);

    // Print ten values from the middle of the string as a sanity check.
    for (i = nx/2; i < nx/2+10; i++) {
        printf("%f, ", y[i]);
        //printf("%g %g\n", x[i], y[i]);
    }
    printf("\n");

    cudaFree(c_y); cudaFree(c_v); cudaFree(c_dvdt);
    return 0;
}
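As a starting point for the exercise, here is a minimal sketch of what a shared-memory (tiled) version of update_dvdt could look like. The names update_dvdt_tiled and BLOCK_SIZE are introduced here for illustration and are not part of the original code; the kernel must be launched with exactly BLOCK_SIZE threads per block and blocks = (nx + BLOCK_SIZE - 1)/BLOCK_SIZE.
#define BLOCK_SIZE 256

// Each block stages its BLOCK_SIZE points plus a one-point halo on each side
// in shared memory, so the three reads in the stencil hit fast shared memory
// instead of global memory.
__global__ void update_dvdt_tiled(double* dvdt, double* y, double dx2inv, int nx) {
    __shared__ double ys[BLOCK_SIZE + 2];                // tile plus left/right halo

    int i  = blockDim.x * blockIdx.x + threadIdx.x + 1;  // global index (interior starts at 1)
    int li = threadIdx.x + 1;                            // local index into ys

    // Cooperative, bounds-guarded loads. There is deliberately no early
    // return before the barrier: every thread must reach __syncthreads().
    if (i < nx) ys[li] = y[i];
    if (threadIdx.x == 0 && i-1 < nx) ys[0] = y[i-1];
    if (threadIdx.x == blockDim.x-1 && i+1 < nx) ys[li+1] = y[i+1];
    __syncthreads();

    if (i < nx-1)
        dvdt[i] = (ys[li+1] + ys[li-1] - 2.0*ys[li])*dx2inv;
}
Note that a 3-point stencil only reuses each global value about three times, and with nx = 500 the arrays are tiny, so do not be surprised if the measured improvement is modest; try a larger nx to see the effect more clearly.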
Congratulations, we’re done!#
Have one of the instructors check your notebook and sign you out before leaving class. Turn in your assignment using D2L.
Written by Dr. Dirk Colbry, Michigan State University (Updated by Dr. Nathan Haut in Spring 2025)
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.