2025-02-16

CEN310 Parallel Programming¶

Week-13 (Real-world Applications II)¶

Spring Semester, 2024-2025¶

Overview¶

Topics¶

Advanced Parallel Patterns
N-body Simulations
Matrix Computations
Big Data Processing

Objectives¶

Implement complex parallel patterns
Optimize scientific simulations
Perform large-scale matrix operations
Process big data efficiently

1. Advanced Parallel Patterns¶

Pipeline Pattern¶

template<typename T>
class ParallelPipeline {
private:
    std::vector<std::thread> stages;
    std::vector<std::queue<T>> queues;
    std::vector<std::mutex> mutexes;
    std::vector<std::condition_variable> cvs;
    bool running;

public:
    ParallelPipeline(int num_stages) {
        queues.resize(num_stages - 1);
        mutexes.resize(num_stages - 1);
        cvs.resize(num_stages - 1);
        running = true;
    }

    void add_stage(std::function<void(T&)> stage_func, int stage_id) {
        stages.emplace_back([this, stage_func, stage_id]() {
            while(running) {
                T data;
                if(stage_id == 0) {
                    // First stage: produce data
                    data = produce_data();
                } else {
                    // Get data from previous stage
                    std::unique_lock<std::mutex> lock(mutexes[stage_id-1]);
                    cvs[stage_id-1].wait(lock, 
                        [this, stage_id]() { 
                            return !queues[stage_id-1].empty() || !running; 
                        });
                    if(!running) break;
                    data = queues[stage_id-1].front();
                    queues[stage_id-1].pop();
                    lock.unlock();
                    cvs[stage_id-1].notify_one();
                }

                // Process data
                stage_func(data);

                if(stage_id < stages.size() - 1) {
                    // Pass to next stage
                    std::unique_lock<std::mutex> lock(mutexes[stage_id]);
                    queues[stage_id].push(data);
                    lock.unlock();
                    cvs[stage_id].notify_one();
                }
            }
        });
    }

    void start() {
        for(auto& stage : stages) {
            stage.join();
        }
    }

    void stop() {
        running = false;
        for(auto& cv : cvs) {
            cv.notify_all();
        }
    }
};

2. N-body Simulations¶

Barnes-Hut Algorithm¶

struct Octree {
    struct Node {
        vec3 center;
        float size;
        float mass;
        vec3 com;
        std::vector<Node*> children;
    };

    Node* root;
    float theta;

    __device__ void compute_force(vec3& pos, vec3& force, Node* node) {
        vec3 diff = node->com - pos;
        float dist = length(diff);

        if(node->size / dist < theta || node->children.empty()) {
            // Use approximation
            float f = G * node->mass / (dist * dist * dist);
            force += diff * f;
        } else {
            // Recurse into children
            for(auto child : node->children) {
                if(child != nullptr) {
                    compute_force(pos, force, child);
                }
            }
        }
    }

    __global__ void update_bodies(vec3* pos, vec3* vel, vec3* acc, 
                                float dt, int n) {
        int idx = blockIdx.x * blockDim.x + threadIdx.x;
        if(idx < n) {
            vec3 force(0.0f);
            compute_force(pos[idx], force, root);
            acc[idx] = force;
            vel[idx] += acc[idx] * dt;
            pos[idx] += vel[idx] * dt;
        }
    }
};

3. Matrix Computations¶

Parallel Matrix Factorization¶

__global__ void lu_factorization(float* A, int n, int k) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if(row > k && row < n && col > k && col < n) {
        A[row * n + col] -= A[row * n + k] * A[k * n + col] / A[k * n + k];
    }
}

void parallel_lu(float* A, int n) {
    dim3 block(16, 16);
    dim3 grid((n + block.x - 1) / block.x, 
              (n + block.y - 1) / block.y);

    for(int k = 0; k < n-1; k++) {
        lu_factorization<<<grid, block>>>(A, n, k);
        cudaDeviceSynchronize();
    }
}

4. Big Data Processing¶

Parallel Data Analysis¶

template<typename T>
class ParallelDataProcessor {
private:
    std::vector<T> data;
    int num_threads;

public:
    ParallelDataProcessor(const std::vector<T>& input, int threads) 
        : data(input), num_threads(threads) {}

    template<typename Func>
    std::vector<T> map(Func f) {
        std::vector<T> result(data.size());
        #pragma omp parallel for num_threads(num_threads)
        for(size_t i = 0; i < data.size(); i++) {
            result[i] = f(data[i]);
        }
        return result;
    }

    template<typename Func>
    T reduce(Func f, T initial) {
        T result = initial;
        #pragma omp parallel num_threads(num_threads)
        {
            T local_sum = initial;
            #pragma omp for nowait
            for(size_t i = 0; i < data.size(); i++) {
                local_sum = f(local_sum, data[i]);
            }
            #pragma omp critical
            {
                result = f(result, local_sum);
            }
        }
        return result;
    }
};

Lab Exercise¶

Tasks¶

Implement Barnes-Hut simulation
Develop parallel LU factorization
Create big data processing pipeline
Analyze performance characteristics

Performance Analysis¶

Algorithm complexity
Memory access patterns
Load balancing
Scalability testing

Resources¶

Documentation¶

Advanced CUDA Programming Guide
Parallel Algorithms Reference
Scientific Computing Libraries

Tools¶

Performance Profilers
Debugging Tools
Analysis Frameworks

Questions & Discussion¶

Contributors:

Ugur Coruh (100.0%)