2025-02-16

CEN310 Parallel Programming

Week-12 (Real-world Applications I)

Spring Semester, 2024-2025

Overview

Topics

Scientific Computing Applications
Data Processing Applications
Performance Optimization
Case Studies

Objectives

Apply parallel programming to real problems
Optimize scientific computations
Process large datasets efficiently
Analyze real-world performance

1. Scientific Computing Applications

N-Body Simulation

// Computes the net pairwise force on every body by direct summation.
//
// One thread handles one body: thread `idx` loops over all `n` bodies and
// accumulates the contribution G * w_i * w_j / d^2 along the unit vector
// towards body j, where w is the fourth component of each `pos` entry.
//
// Parameters:
//   pos    - per-body float4; x/y/z are read as coordinates and w is used as
//            the mass factor in the force term.
//   vel    - not read by this kernel; kept so the signature stays unchanged.
//   forces - output, one float4 per body (w is always written as 0).
//   n      - number of bodies; threads with idx >= n do nothing.
//
// Expects a 1D launch with at least n threads in total.
__global__ void calculate_forces(float4* pos, float4* vel, float4* forces, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        float4 my_pos = pos[idx];
        float4 force = make_float4(0.0f, 0.0f, 0.0f, 0.0f);

        for(int j = 0; j < n; j++) {
            float4 other_pos = pos[j];
            float3 r = make_float3(
                other_pos.x - my_pos.x,
                other_pos.y - my_pos.y,
                other_pos.z - my_pos.z
            );
            float dist_sq = r.x*r.x + r.y*r.y + r.z*r.z;

            // Skip the body itself and any body at exactly the same position:
            // a zero distance would divide by zero and turn the whole
            // accumulated force into NaN/Inf.
            if(j != idx && dist_sq > 0.0f) {
                // f * r / dist == G*m1*m2 * r / dist^3, so a single reciprocal
                // square root replaces the sqrt and the three divisions.
                float inv_dist = rsqrtf(dist_sq);
                float scale = (G * my_pos.w * other_pos.w) * inv_dist * inv_dist * inv_dist;
                force.x += scale * r.x;
                force.y += scale * r.y;
                force.z += scale * r.z;
            }
        }
        forces[idx] = force;
    }
}

2. Data Processing Applications

Image Processing

// Convolves a single-channel 8-bit image with a square filter kernel.
//
// One thread produces one output pixel. Samples that fall outside the image
// are taken from the nearest edge pixel (coordinates are clamped).
//
// Parameters:
//   input       - source image, row-major, width * height bytes.
//   output      - destination image, same layout as input.
//   width       - image width in pixels.
//   height      - image height in pixels.
//   kernel      - filter weights, row-major, kernel_size * kernel_size floats.
//   kernel_size - side length of the filter; the window is centred on the
//                 pixel for odd sizes. Even sizes are accepted and use the
//                 taps at offsets [-kernel_size/2, kernel_size/2 - 1].
//
// Expects a 2D launch covering at least width x height threads.
__global__ void gaussian_blur(
    unsigned char* input,
    unsigned char* output,
    int width,
    int height,
    float* kernel,
    int kernel_size
) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x < width && y < height) {
        float sum = 0.0f;
        int k_radius = kernel_size / 2;

        // Iterate over exactly kernel_size taps per axis so that `kernel` is
        // never indexed past its end, even when kernel_size is even.
        for(int j = 0; j < kernel_size; j++) {
            int py = min(max(y + j - k_radius, 0), height - 1);
            for(int i = 0; i < kernel_size; i++) {
                int px = min(max(x + i - k_radius, 0), width - 1);
                float kernel_val = kernel[j*kernel_size + i];
                sum += input[(size_t)py*width + px] * kernel_val;
            }
        }

        // Saturate to the 8-bit range before converting: casting an
        // out-of-range float to unsigned char is undefined, and a kernel that
        // is not perfectly normalised can overshoot. Adding 0.5 rounds to the
        // nearest value instead of truncating (which would darken the image).
        float clamped = fminf(fmaxf(sum, 0.0f), 255.0f);
        output[(size_t)y*width + x] = (unsigned char)(clamped + 0.5f);
    }
}

3. Performance Optimization

Memory Access Optimization

// Optimize matrix transpose
//
// Transposes a row-major `height` x `width` matrix into a row-major
// `width` x `height` matrix, staging each tile in shared memory.
//
// Parameters:
//   input  - source matrix, element (row y, col x) at input[y*width + x].
//   output - destination matrix, element (row x, col y) at output[x*height + y].
//   width  - number of columns of the input.
//   height - number of rows of the input.
//
// Launch requirements: 2D thread blocks with blockDim.x == blockDim.y, no
// larger than BLOCK_SIZE, and a grid that covers width x height. The tile is
// read back with swapped thread indices, which is only valid for square blocks.
__global__ void matrix_transpose(float* input, float* output, int width, int height) {
    __shared__ float tile[BLOCK_SIZE][BLOCK_SIZE+1]; // +1 padding column avoids bank conflicts

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Load into shared memory (only the part of the tile inside the matrix)
    if(x < width && y < height) {
        tile[threadIdx.y][threadIdx.x] = input[(size_t)y*width + x];
    }

    // The barrier must be reached by every thread of the block, so it lives
    // outside the bounds check; edge blocks would otherwise diverge around it.
    __syncthreads();

    // Calculate transposed indices
    int new_x = blockIdx.y * blockDim.y + threadIdx.x;
    int new_y = blockIdx.x * blockDim.x + threadIdx.y;

    // The store has its own guard, independent of the load guard: in a
    // partial tile the thread that writes an element is not the thread that
    // loaded it. This guard is true exactly when tile[threadIdx.x][threadIdx.y]
    // was filled above.
    if(new_x < height && new_y < width) {
        output[(size_t)new_y*height + new_x] = tile[threadIdx.x][threadIdx.y];
    }
}

4. Case Studies

Monte Carlo Simulation

// Counts how many sample points fall inside the unit circle.
//
// Each thread tests one point (points_x[i], points_y[i]) and, when
// x^2 + y^2 <= 1, atomically increments the counter at `inside_circle`.
// Threads whose index is >= n exit without touching memory.
// The counter is only incremented here, never reset.
__global__ void monte_carlo_pi(float* points_x, float* points_y, int* inside_circle, int n) {
    const int tid = threadIdx.x + blockDim.x * blockIdx.x;
    if(tid >= n) {
        return;
    }

    const float px = points_x[tid];
    const float py = points_y[tid];

    // Compare the squared radius directly; no square root is needed.
    const bool hit = (px*px + py*py) <= 1.0f;
    if(hit) {
        atomicAdd(inside_circle, 1);
    }
}

// Aborts with a diagnostic when a CUDA runtime call fails.
static void check_cuda(cudaError_t status, const char* what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Estimates pi by Monte Carlo sampling: generates n random points in the unit
// square on the host, counts on the GPU how many land inside the unit circle
// via monte_carlo_pi, and prints 4 * inside / n.
// Returns 0 on success; exits with a failure status on any allocation or
// CUDA error.
int main() {
    int n = 1000000;
    float *h_x, *h_y, *d_x, *d_y;
    int *h_inside, *d_inside;

    size_t bytes = (size_t)n * sizeof(float);

    // Allocate and initialize memory
    h_x = (float*)malloc(bytes);
    h_y = (float*)malloc(bytes);
    h_inside = (int*)malloc(sizeof(int));
    if (h_x == NULL || h_y == NULL || h_inside == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(h_x);
        free(h_y);
        free(h_inside);
        return EXIT_FAILURE;
    }
    *h_inside = 0;

    check_cuda(cudaMalloc((void**)&d_x, bytes), "cudaMalloc d_x");
    check_cuda(cudaMalloc((void**)&d_y, bytes), "cudaMalloc d_y");
    check_cuda(cudaMalloc((void**)&d_inside, sizeof(int)), "cudaMalloc d_inside");

    // Generate random points
    for(int i = 0; i < n; i++) {
        h_x[i] = (float)rand()/RAND_MAX;
        h_y[i] = (float)rand()/RAND_MAX;
    }

    // Copy data to device and run kernel
    check_cuda(cudaMemcpy(d_x, h_x, bytes, cudaMemcpyHostToDevice), "copy x to device");
    check_cuda(cudaMemcpy(d_y, h_y, bytes, cudaMemcpyHostToDevice), "copy y to device");
    // The kernel only increments the counter, so it has to start at zero.
    check_cuda(cudaMemset(d_inside, 0, sizeof(int)), "zero counter");

    int threads = 256;
    int blocks = (n + threads - 1) / threads;   // ceil-div so every point gets a thread
    monte_carlo_pi<<<blocks, threads>>>(d_x, d_y, d_inside, n);
    check_cuda(cudaGetLastError(), "kernel launch");

    // Blocking copy: also waits for the kernel and reports any execution error.
    check_cuda(cudaMemcpy(h_inside, d_inside, sizeof(int), cudaMemcpyDeviceToHost),
               "copy result to host");

    // Calculate pi
    float pi = 4.0f * (*h_inside) / (float)n;
    printf("Estimated Pi: %f\n", pi);

    // Cleanup
    check_cuda(cudaFree(d_x), "cudaFree d_x");
    check_cuda(cudaFree(d_y), "cudaFree d_y");
    check_cuda(cudaFree(d_inside), "cudaFree d_inside");
    free(h_x);
    free(h_y);
    free(h_inside);

    return 0;
}

Lab Exercise

Tasks

Implement N-body simulation
Optimize image processing kernel
Develop Monte Carlo simulation
Compare performance with CPU versions

Performance Analysis

Execution time
Memory bandwidth
GPU utilization
Scaling behavior

Resources

Documentation

CUDA Sample Applications
Scientific Computing Libraries
Performance Analysis Tools

Tools

NVIDIA Visual Profiler
Parallel Computing Toolbox
Performance Libraries

Questions & Discussion

Contributors:

Ugur Coruh (100.0%)