# CEN310 Parallel Programming

## Week-5: GPU Programming
## Outline (1/4)

- Introduction to GPU Computing
    - GPU Architecture Overview
    - CUDA Programming Model
    - GPU Memory Hierarchy
    - Thread Organization
    - Kernel Functions
- CUDA Programming Basics
    - Memory Management
    - Thread Organization
    - Synchronization
    - Error Handling
    - CUDA Runtime API
## Outline (2/4)

- Memory Management
    - Global Memory
    - Shared Memory
    - Constant Memory
    - Texture Memory
    - Unified Memory
- Thread Organization
    - Blocks and Grids
    - Warps and Scheduling
    - Thread Synchronization
    - Occupancy
    - Load Balancing
## Outline (3/4)

- Performance Optimization
    - Memory Coalescing
    - Bank Conflicts
    - Divergent Branching
    - Shared Memory Usage
    - Asynchronous Operations
- Advanced Features
    - Streams and Events
    - Dynamic Parallelism
    - Multi-GPU Programming
    - Unified Memory
    - Cooperative Groups
## Outline (4/4)

- Best Practices
    - Code Organization
    - Error Handling
    - Debugging Techniques
    - Profiling Tools
    - Common Pitfalls
- Real-World Applications
    - Image Processing
    - Scientific Computing
    - Machine Learning
    - Data Analytics
## 1. Introduction to GPU Computing

### GPU Architecture (1/4)
```text
   CPU                 GPU
┌───────┐          ┌───────┐
│ Core  │          │  SM   │
└───┬───┘          └───┬───┘
    │                  │
┌───┴───┐          ┌───┴───┐
│ Cache │          │  SM   │
└───┬───┘          └───┬───┘
    │                  │
┌───┴───┐          ┌───┴───┐
│  RAM  │          │  SM   │
└───────┘          └───┬───┘
                       │
                   ┌───┴───┐
                   │ VRAM  │
                   └───────┘
```
Key components:

- Streaming Multiprocessors (SMs)
- CUDA Cores
- Memory Hierarchy
- Warp Schedulers

Most of these can be queried at runtime, as the sketch below shows.
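A minimal device-query sketch using the CUDA runtime's `cudaGetDeviceProperties` (the property fields are part of the real API; the selection and formatting here are ours):

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query device 0

    printf("Device:               %s\n", prop.name);
    printf("SMs:                  %d\n", prop.multiProcessorCount);
    printf("Warp size:            %d\n", prop.warpSize);
    printf("Shared mem per block: %zu bytes\n", prop.sharedMemPerBlock);
    printf("Global memory:        %zu bytes\n", prop.totalGlobalMem);
    return 0;
}
```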
### GPU Architecture (2/4)

#### Memory Hierarchy
```cpp
// Example showing the different memory spaces
__device__ __constant__ float device_constant[256]; // Constant memory (file scope)

__global__ void memory_example(const float* global_input, // Global memory
                               float* global_output,
                               int n) {
    // Shared memory must be declared inside the kernel
    __shared__ float shared_array[256];

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if(idx >= n) return;

    // Register (automatic) variable
    float local_var = global_input[idx];

    // Shared memory usage (assumes blockDim.x <= 256)
    shared_array[threadIdx.x] = local_var;
    __syncthreads();

    // Constant memory usage
    local_var *= device_constant[threadIdx.x];

    // Write back to global memory
    global_output[idx] = local_var;
}
```
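A hypothetical host-side launch for this kernel, assuming device buffers `d_in` and `d_out` of `N` floats already exist:

```cpp
// 256-thread blocks; enough blocks to cover all N elements
memory_example<<<(N + 255) / 256, 256>>>(d_in, d_out, N);
```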
### GPU Architecture (3/4)

#### Thread Hierarchy
```cpp
#include <cuda_runtime.h>
#include <cstdio>

__global__ void thread_hierarchy_example() {
    // Thread identification
    int thread_idx = threadIdx.x;
    int block_idx = blockIdx.x;
    int warp_id = threadIdx.x / 32;

    // Block and grid dimensions
    int block_size = blockDim.x;
    int grid_size = gridDim.x;

    // Global thread ID
    int global_idx = thread_idx + block_idx * block_size;

    // Print thread information (device-side printf)
    printf("Thread %d in block %d (warp %d), global id %d of %d\n",
           thread_idx, block_idx, warp_id,
           global_idx, block_size * grid_size);
}

int main() {
    const int N = 1024;

    // Launch configuration
    dim3 block_size(256);
    dim3 grid_size((N + block_size.x - 1) / block_size.x);
    thread_hierarchy_example<<<grid_size, block_size>>>();

    // Wait for the kernel so its printf output is flushed
    cudaDeviceSynchronize();
    return 0;
}
```
### GPU Architecture (4/4)

#### Basic CUDA Program
```cpp
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <cmath>

// Kernel definition
__global__ void vector_add(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

int main() {
    int N = 1000000;
    size_t size = N * sizeof(float);

    // Allocate host memory
    float *h_a = (float*)malloc(size);
    float *h_b = (float*)malloc(size);
    float *h_c = (float*)malloc(size);

    // Initialize arrays
    for(int i = 0; i < N; i++) {
        h_a[i] = rand() / (float)RAND_MAX;
        h_b[i] = rand() / (float)RAND_MAX;
    }

    // Allocate device memory
    float *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, size);
    cudaMalloc(&d_b, size);
    cudaMalloc(&d_c, size);

    // Copy inputs to the device
    cudaMemcpy(d_a, h_a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, size, cudaMemcpyHostToDevice);

    // Launch kernel: one thread per element
    int block_size = 256;
    int num_blocks = (N + block_size - 1) / block_size;
    vector_add<<<num_blocks, block_size>>>(d_a, d_b, d_c, N);

    // Copy result back (implicitly waits for the kernel)
    cudaMemcpy(h_c, d_c, size, cudaMemcpyDeviceToHost);

    // Verify results
    for(int i = 0; i < N; i++) {
        if(fabs(h_c[i] - (h_a[i] + h_b[i])) > 1e-5) {
            fprintf(stderr, "Verification failed at %d\n", i);
            break;
        }
    }

    // Cleanup
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);
    return 0;
}
```
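To build and run, assuming the file is saved as `vector_add.cu` (name hypothetical): `nvcc -O2 -o vector_add vector_add.cu && ./vector_add`.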
## 2. CUDA Programming Basics

### Memory Management (1/4)
```cpp
void memory_management_example() {
    const int N = 1 << 20;
    size_t size = N * sizeof(float);

    // Host (pageable) memory allocation
    float* h_data = (float*)malloc(size);

    // Device memory allocation
    float* d_data;
    cudaMalloc(&d_data, size);

    // Pinned (page-locked) host memory: required for truly async copies
    float* h_pinned;
    cudaMallocHost(&h_pinned, size);

    // Unified memory: accessible from both host and device
    float* unified;
    cudaMallocManaged(&unified, size);

    // Synchronous memory transfers
    cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
    cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);

    // Asynchronous transfer in a stream (host buffer must be pinned)
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    cudaMemcpyAsync(d_data, h_pinned, size,
                    cudaMemcpyHostToDevice, stream);

    // Wait for the async copy before releasing its buffers
    cudaStreamSynchronize(stream);

    // Cleanup
    free(h_data);
    cudaFree(d_data);
    cudaFreeHost(h_pinned);
    cudaFree(unified);
    cudaStreamDestroy(stream);
}
```
### Memory Management (2/4)

#### Shared Memory Usage
```cpp
__global__ void shared_memory_example(const float* input,
                                      float* output,
                                      int n) {
    extern __shared__ float shared[];
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Load one element per thread into shared memory
    if(gid < n) {
        shared[tid] = input[gid];
    }
    __syncthreads();

    // 3-point smoothing on the interior elements of the block
    if(tid > 0 && tid < blockDim.x - 1 && gid < n - 1) {
        float result = 0.25f * (shared[tid - 1] +
                                2.0f * shared[tid] +
                                shared[tid + 1]);
        output[gid] = result;
    }
}

// Host-side launch: the third <<<>>> argument is the
// dynamic shared-memory size in bytes
int block_size = 256;
int grid_size = (N + block_size - 1) / block_size;
int shared_size = block_size * sizeof(float);
shared_memory_example<<<grid_size, block_size, shared_size>>>(d_input, d_output, N);
```
### Memory Management (3/4)

#### Constant Memory
```cpp
__constant__ float const_array[256];

void setup_constant_memory() {
    float h_const_array[256];

    // Initialize constant data (compute_constant is a placeholder
    // for whatever lookup table is needed)
    for(int i = 0; i < 256; i++) {
        h_const_array[i] = compute_constant(i);
    }

    // Copy to constant memory on the device
    cudaMemcpyToSymbol(const_array, h_const_array,
                       256 * sizeof(float));
}

__global__ void use_constant_memory(const float* input,
                                    float* output,
                                    int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < n) {
        // Constant memory reads are cached and broadcast efficiently
        // when all threads of a warp read the same address
        output[idx] = input[idx] * const_array[idx % 256];
    }
}
```
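Putting the two together, a hypothetical call sequence (`d_input`, `d_output`, and `N` are assumed to exist):

```cpp
setup_constant_memory();  // must run before any kernel that reads const_array

int block_size = 256;
int grid_size = (N + block_size - 1) / block_size;
use_constant_memory<<<grid_size, block_size>>>(d_input, d_output, N);
```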
### Memory Management (4/4)

#### Texture Memory
```cpp
// NOTE: the texture *reference* API shown here is legacy; it was
// deprecated and removed in CUDA 12 (see the texture-object sketch below)
texture<float, 2, cudaReadModeElementType> tex_ref;

__global__ void texture_kernel(float* output, int width, int height);

void texture_memory_example() {
    // Allocate a 2D CUDA array
    cudaArray* d_array;
    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();
    cudaMallocArray(&d_array, &channel_desc, width, height);

    // Copy data into the array
    cudaMemcpyToArray(d_array, 0, 0, h_data,
                      width * height * sizeof(float),
                      cudaMemcpyHostToDevice);

    // Bind the texture reference to the array
    cudaBindTextureToArray(tex_ref, d_array);

    // Kernel reads go through the texture cache
    texture_kernel<<<grid_size, block_size>>>(d_output, width, height);

    // Cleanup
    cudaUnbindTexture(tex_ref);
    cudaFreeArray(d_array);
}

__global__ void texture_kernel(float* output,
                               int width, int height) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x < width && y < height) {
        // Read from the texture
        float value = tex2D(tex_ref, x, y);
        output[y * width + x] = value;
    }
}
```
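On current toolkits the same effect is achieved with texture *objects*, which are passed to kernels as ordinary arguments. A minimal sketch (reusing the `d_array` allocated above; `object_kernel` is hypothetical):

```cpp
// Describe the resource backing the texture
cudaResourceDesc res_desc = {};
res_desc.resType = cudaResourceTypeArray;
res_desc.res.array.array = d_array;

// Describe how the texture is sampled
cudaTextureDesc tex_desc = {};
tex_desc.addressMode[0] = cudaAddressModeClamp;
tex_desc.addressMode[1] = cudaAddressModeClamp;
tex_desc.filterMode = cudaFilterModePoint;
tex_desc.readMode = cudaReadModeElementType;

cudaTextureObject_t tex_obj = 0;
cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, nullptr);

// The kernel takes the object as a parameter and reads with
// tex2D<float>(tex_obj, x, y):
// object_kernel<<<grid_size, block_size>>>(tex_obj, d_output, width, height);

cudaDestroyTextureObject(tex_obj);
```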
## 3. Thread Organization

### Thread Hierarchy (1/4)
```cpp
__global__ void thread_organization_example() {
    // Thread indices within the block
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int tz = threadIdx.z;

    // Block indices within the grid
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int bz = blockIdx.z;

    // Block dimensions
    int bdx = blockDim.x;
    int bdy = blockDim.y;
    int bdz = blockDim.z;

    // Grid dimensions
    int gdx = gridDim.x;
    int gdy = gridDim.y;
    int gdz = gridDim.z;

    // Global (grid-wide) indices
    int global_x = bx * bdx + tx;
    int global_y = by * bdy + ty;
    int global_z = bz * bdz + tz;

    // Linear index: the full grid is (gdx*bdx) threads wide
    // and (gdy*bdy) threads high
    int linear_idx = global_z * gdx * gdy * bdx * bdy +
                     global_y * gdx * bdx +
                     global_x;
}
```
### Thread Hierarchy (2/4)

#### Block Configuration
```cpp
// Forward declarations (kernels must be declared before launch)
__global__ void kernel_1d();
__global__ void kernel_2d();
__global__ void kernel_3d();

void launch_configuration_example(int N, int width, int height, int depth) {
    // 1D configuration
    dim3 block_1d(256);
    dim3 grid_1d((N + block_1d.x - 1) / block_1d.x);
    kernel_1d<<<grid_1d, block_1d>>>();

    // 2D configuration
    dim3 block_2d(16, 16);
    dim3 grid_2d((width + block_2d.x - 1) / block_2d.x,
                 (height + block_2d.y - 1) / block_2d.y);
    kernel_2d<<<grid_2d, block_2d>>>();

    // 3D configuration
    dim3 block_3d(8, 8, 8);
    dim3 grid_3d((width + block_3d.x - 1) / block_3d.x,
                 (height + block_3d.y - 1) / block_3d.y,
                 (depth + block_3d.z - 1) / block_3d.z);
    kernel_3d<<<grid_3d, block_3d>>>();
}

// Kernel skeletons computing their global indices
__global__ void kernel_1d() {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
}
__global__ void kernel_2d() {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
}
__global__ void kernel_3d() {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int z = blockIdx.z * blockDim.z + threadIdx.z;
}
```
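Block size need not be guessed: the occupancy API can suggest one. A minimal sketch using the runtime call `cudaOccupancyMaxPotentialBlockSize` (reusing `kernel_1d` from above; `N` is assumed):

```cpp
int min_grid_size = 0;  // minimum grid size for full occupancy
int block_size = 0;     // suggested block size

// Ask the runtime for a block size that maximizes occupancy of kernel_1d
cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                   kernel_1d,
                                   0 /*dynamic smem*/, 0 /*block limit*/);

int grid_size = (N + block_size - 1) / block_size;
kernel_1d<<<grid_size, block_size>>>();
```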
### Thread Hierarchy (3/4)

#### Warp Management
```cpp
__global__ void warp_example() {
    int tid = threadIdx.x;
    int warp_id = tid / 32;  // which warp within the block
    int lane_id = tid % 32;  // position within the warp

    // Warp-level primitives: vote and shuffle
    unsigned int mask = __ballot_sync(__activemask(), tid < 16);
    int first_lane_tid = __shfl_sync(__activemask(), tid, 0);

    // Warp-level reduction: each step halves the active distance
    int sum = tid;
    for(int offset = 16; offset > 0; offset /= 2) {
        sum += __shfl_down_sync(__activemask(), sum, offset);
    }
    // After the loop, lane 0 of each warp holds the warp's sum

    // Warp-level synchronization
    __syncwarp();
}
```
### Thread Hierarchy (4/4)

#### Dynamic Parallelism
```cpp
#define MAX_ARRAY_SIZE 4096  // fixed stride between packed arrays

__global__ void child_kernel(int* data, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < size) {
        data[idx] *= 2;
    }
}

__global__ void parent_kernel(int* data,
                              int* sizes,
                              int num_arrays) {
    int array_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(array_idx < num_arrays) {
        int size = sizes[array_idx];
        int* array_data = &data[array_idx * MAX_ARRAY_SIZE];

        // Launch a child kernel from the device
        int block_size = 256;
        int grid_size = (size + block_size - 1) / block_size;
        child_kernel<<<grid_size, block_size>>>(array_data, size);
    }
}
```
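Device-side launches require relocatable device code and the device runtime library; a typical build line (file name hypothetical) is `nvcc -rdc=true -arch=sm_70 -o dynpar dynamic_parallelism.cu -lcudadevrt`.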
## 4. Performance Optimization

### Memory Coalescing (1/4)
```cpp
// Bad: each thread walks down its own column, so at every step the
// warp's accesses are 'height' elements apart (strided, uncoalesced)
__global__ void uncoalesced_access(const float* input,
                                   float* output,
                                   int width,
                                   int height) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < width) {
        for(int y = 0; y < height; y++) {
            output[idx * height + y] =
                input[idx * height + y]; // Strided access
        }
    }
}

// Good: adjacent threads touch adjacent addresses, so each warp's
// loads and stores coalesce into a few wide memory transactions
__global__ void coalesced_access(const float* input,
                                 float* output,
                                 int width,
                                 int height) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(idx < width && y < height) {
        output[y * width + idx] =
            input[y * width + idx]; // Coalesced access
    }
}
```
### Memory Coalescing (2/4)

#### Bank Conflicts
```cpp
__global__ void bank_conflicts_example(float* data) {
    // Shared memory is split into 32 banks; float addresses that are
    // 32 apart land in the same bank
    extern __shared__ float shared[];
    int tid = threadIdx.x;

    // Bad: stride-32 indexing puts every thread of a warp in the same
    // bank (a 32-way conflict, serialized into 32 transactions).
    // Assumes the dynamic allocation is blockDim.x * 32 floats.
    shared[tid * 32] = data[tid];

    // Good: consecutive indices hit 32 different banks (no conflict)
    shared[tid] = data[tid];
    __syncthreads();

    // Process data
    float result = shared[tid];
    // ...
}
```
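For 2D tiles the standard remedy is to pad the inner dimension by one element so that column accesses no longer map to a single bank. A common sketch (the kernel name and data layout are illustrative):

```cpp
__global__ void padded_tile_example(const float* data) {
    // 33 columns instead of 32: walking down a column now visits
    // a different bank on every row instead of hitting one bank
    __shared__ float tile[32][33];

    int x = threadIdx.x;
    int y = threadIdx.y;
    tile[y][x] = data[y * 32 + x];
    __syncthreads();

    // Reading a column (the tile[x][y] pattern used in transposes)
    // is now conflict-free
    float transposed = tile[x][y];
}
```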
### Memory Coalescing (3/4)

#### Shared Memory Optimization
```cpp
template<int BLOCK_SIZE>
__global__ void matrix_multiply(const float* A,
                                const float* B,
                                float* C,
                                int width) {
    // One BLOCK_SIZE x BLOCK_SIZE tile of each input matrix
    __shared__ float shared_A[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float shared_B[BLOCK_SIZE][BLOCK_SIZE];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;

    int row = by * BLOCK_SIZE + ty;
    int col = bx * BLOCK_SIZE + tx;

    float sum = 0.0f;

    // March the tiles across the row of A and down the column of B
    // (assumes width is a multiple of BLOCK_SIZE)
    for(int block = 0; block < width / BLOCK_SIZE; block++) {
        // Cooperative load of both tiles into shared memory
        shared_A[ty][tx] = A[row * width + block * BLOCK_SIZE + tx];
        shared_B[ty][tx] = B[(block * BLOCK_SIZE + ty) * width + col];
        __syncthreads();

        // Partial dot product over this tile
        for(int k = 0; k < BLOCK_SIZE; k++) {
            sum += shared_A[ty][k] * shared_B[k][tx];
        }
        __syncthreads();
    }

    // Store the result
    C[row * width + col] = sum;
}
```
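A matching launch, assuming square matrices `d_A`, `d_B`, `d_C` whose side `width` is a multiple of the tile size:

```cpp
const int TILE = 16;
dim3 block(TILE, TILE);              // one thread per output element
dim3 grid(width / TILE, width / TILE);
matrix_multiply<TILE><<<grid, block>>>(d_A, d_B, d_C, width);
```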
### Memory Coalescing (4/4)

#### Memory Access Patterns
```cpp
// Structure of Arrays (SoA)
struct ParticlesSoA {
    float* x;
    float* y;
    float* z;
    float* vx;
    float* vy;
    float* vz;
};

// Array of Structures (AoS)
struct ParticleAoS {
    float x, y, z;
    float vx, vy, vz;
};

// SoA kernel: each field access is a contiguous, coalesced load
__global__ void update_particles_soa(ParticlesSoA particles, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < n) {
        particles.x[idx] += particles.vx[idx];
        particles.y[idx] += particles.vy[idx];
        particles.z[idx] += particles.vz[idx];
    }
}

// AoS kernel: neighboring threads access addresses sizeof(ParticleAoS)
// apart, so the same loads are strided and coalesce poorly
__global__ void update_particles_aos(ParticleAoS* particles, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if(idx < n) {
        particles[idx].x += particles[idx].vx;
        particles[idx].y += particles[idx].vy;
        particles[idx].z += particles[idx].vz;
    }
}
```
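The SoA layout does require more host-side bookkeeping; a hypothetical allocation helper:

```cpp
void alloc_particles_soa(ParticlesSoA* p, int n) {
    size_t bytes = n * sizeof(float);
    // One device allocation per component array
    cudaMalloc(&p->x,  bytes);
    cudaMalloc(&p->y,  bytes);
    cudaMalloc(&p->z,  bytes);
    cudaMalloc(&p->vx, bytes);
    cudaMalloc(&p->vy, bytes);
    cudaMalloc(&p->vz, bytes);
}
```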
## 5. Advanced Features

### Streams and Events (1/3)
```cpp
// process_kernel, N, and size (= N * sizeof(float)) are assumed to be
// defined elsewhere; N must be divisible by num_streams here
void stream_example() {
    const int num_streams = 4;
    cudaStream_t streams[num_streams];

    // Create streams
    for(int i = 0; i < num_streams; i++) {
        cudaStreamCreate(&streams[i]);
    }

    // Allocate memory (pinned host buffers enable async copies)
    float *h_input, *d_input, *h_output, *d_output;
    cudaMallocHost(&h_input, size);
    cudaMallocHost(&h_output, size);
    cudaMalloc(&d_input, size);
    cudaMalloc(&d_output, size);

    // Split the work into chunks: copies and kernels issued to
    // different streams can overlap with each other
    int chunk_size = N / num_streams;
    int block_size = 256;
    int grid_size = (chunk_size + block_size - 1) / block_size;
    for(int i = 0; i < num_streams; i++) {
        int offset = i * chunk_size;
        cudaMemcpyAsync(&d_input[offset], &h_input[offset],
                        chunk_size * sizeof(float),
                        cudaMemcpyHostToDevice, streams[i]);
        process_kernel<<<grid_size, block_size, 0, streams[i]>>>
            (&d_input[offset], &d_output[offset], chunk_size);
        cudaMemcpyAsync(&h_output[offset], &d_output[offset],
                        chunk_size * sizeof(float),
                        cudaMemcpyDeviceToHost, streams[i]);
    }

    // Synchronize all streams
    cudaDeviceSynchronize();

    // Cleanup
    for(int i = 0; i < num_streams; i++) {
        cudaStreamDestroy(streams[i]);
    }
    cudaFreeHost(h_input);
    cudaFreeHost(h_output);
    cudaFree(d_input);
    cudaFree(d_output);
}
```
### Streams and Events (2/3)

#### Event Management
```cpp
void event_example() {
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record start event
    cudaEventRecord(start);

    // Launch kernel (grid_size, block_size, d_data, N as before)
    process_kernel<<<grid_size, block_size>>>(d_data, N);

    // Record stop event
    cudaEventRecord(stop);

    // Wait for completion
    cudaEventSynchronize(stop);

    // Elapsed GPU time between the two events
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    printf("Kernel execution time: %f ms\n", milliseconds);

    // Cleanup
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
}
```
### Streams and Events (3/3)

#### Inter-stream Synchronization
```cpp
void stream_synchronization() {
    cudaStream_t stream1, stream2;
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);

    cudaEvent_t event;
    cudaEventCreate(&event);

    // Launch work in stream1 and record an event after it
    kernel1<<<grid_size, block_size, 0, stream1>>>(d_data1, N);
    cudaEventRecord(event, stream1);

    // Make stream2 wait until the event has completed
    cudaStreamWaitEvent(stream2, event);

    // This kernel will not start before kernel1 finishes
    kernel2<<<grid_size, block_size, 0, stream2>>>(d_data2, N);

    // Cleanup
    cudaEventDestroy(event);
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
}
```
## 6. Best Practices

### Error Handling (1/3)
```cpp
// Wrap every CUDA runtime call so failures report file and line
#define CUDA_CHECK(call) do { \
    cudaError_t error = call; \
    if(error != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", \
                __FILE__, __LINE__, \
                cudaGetErrorString(error)); \
        exit(EXIT_FAILURE); \
    } \
} while(0)

void cuda_error_handling() {
    // Allocate memory
    float* d_data;
    CUDA_CHECK(cudaMalloc(&d_data, size));

    // Launch kernel, then check for launch errors
    process_kernel<<<grid_size, block_size>>>(d_data, N);
    CUDA_CHECK(cudaGetLastError());

    // Synchronize and check for execution errors
    CUDA_CHECK(cudaDeviceSynchronize());

    // Cleanup
    CUDA_CHECK(cudaFree(d_data));
}
```
### Error Handling (2/3)

#### Debug Tools
```cpp
void debug_example() {
    // Limit device-runtime sync depth (only relevant when debugging
    // dynamic parallelism); to make every launch synchronous while
    // debugging, run with the environment variable CUDA_LAUNCH_BLOCKING=1
    cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, 1);

    // Print device properties
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);
    printf("Device: %s\n", prop.name);
    printf("Compute capability: %d.%d\n",
           prop.major, prop.minor);

    // Launch kernel with debug info
#ifdef DEBUG
    printf("Launching kernel with grid=%d, block=%d\n",
           grid_size.x, block_size.x);
#endif
    process_kernel<<<grid_size, block_size>>>(d_data, N);

    // Check for kernel launch errors
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("Kernel error: %s\n",
               cudaGetErrorString(error));
    }
}
```
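Errors that never surface as return codes, such as out-of-bounds accesses or data races, can be caught with NVIDIA's standalone checker: run the application under `compute-sanitizer ./my_app` (or `cuda-memcheck` on older toolkits; the binary name here is hypothetical).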
### Error Handling (3/3)

#### Resource Management
```cpp
#include <stdexcept>

class CUDAResource {
private:
    void* ptr;
    size_t size;
public:
    // Initializer order matches declaration order (ptr, then size)
    CUDAResource(size_t s) : ptr(nullptr), size(s) {
        // Throw (rather than exit) so callers can use try/catch
        cudaError_t err = cudaMalloc(&ptr, size);
        if(err != cudaSuccess) {
            throw std::runtime_error(cudaGetErrorString(err));
        }
    }
    ~CUDAResource() {
        if(ptr) {
            cudaFree(ptr);
        }
    }
    void* get() { return ptr; }
    size_t get_size() const { return size; }

    // Prevent copying (avoids double-free)
    CUDAResource(const CUDAResource&) = delete;
    CUDAResource& operator=(const CUDAResource&) = delete;
};

void resource_management_example() {
    try {
        CUDAResource d_input(N * sizeof(float));
        CUDAResource d_output(N * sizeof(float));

        // Use resources; memory is released automatically on scope exit
        process_kernel<<<grid_size, block_size>>>
            (static_cast<float*>(d_input.get()),
             static_cast<float*>(d_output.get()), N);
    }
    catch(const std::exception& e) {
        fprintf(stderr, "Error: %s\n", e.what());
    }
}
```
## 7. Real-World Applications

### Image Processing (1/3)
```cpp
// Image convolution kernel
__global__ void convolution_2d(unsigned char* input,
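                               unsigned char* output,
                               const float* kernel,
                               int width, int height,
                               int ksize) {
    // NOTE: the original material ends at the signature above; the
    // parameter list and body from here on are a minimal sketch of a
    // 2D convolution with border clamping, not the author's code.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if(x >= width || y >= height) return;

    int r = ksize / 2;
    float sum = 0.0f;
    for(int ky = -r; ky <= r; ky++) {
        for(int kx = -r; kx <= r; kx++) {
            // Clamp coordinates at the image border
            int ix = min(max(x + kx, 0), width - 1);
            int iy = min(max(y + ky, 0), height - 1);
            sum += kernel[(ky + r) * ksize + (kx + r)] *
                   input[iy * width + ix];
        }
    }
    // Clamp the result into the 8-bit range
    output[y * width + x] = (unsigned char)fminf(fmaxf(sum, 0.0f), 255.0f);
}
```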