CEN310 Parallel Programming
Week-13 (Real-world Applications II)
Spring Semester, 2024-2025
Overview
Topics
- Advanced Parallel Patterns
- N-body Simulations
- Matrix Computations
- Big Data Processing
Objectives
- Implement complex parallel patterns
- Optimize scientific simulations
- Perform large-scale matrix operations
- Process big data efficiently
1. Advanced Parallel Patterns
Pipeline Pattern
template<typename T>
class ParallelPipeline {
private:
std::vector<std::thread> stages;
std::vector<std::queue<T>> queues;
std::vector<std::mutex> mutexes;
std::vector<std::condition_variable> cvs;
bool running;
public:
ParallelPipeline(int num_stages) {
queues.resize(num_stages - 1);
mutexes.resize(num_stages - 1);
cvs.resize(num_stages - 1);
running = true;
}
void add_stage(std::function<void(T&)> stage_func, int stage_id) {
stages.emplace_back([this, stage_func, stage_id]() {
while(running) {
T data;
if(stage_id == 0) {
// First stage: produce data
data = produce_data();
} else {
// Get data from previous stage
std::unique_lock<std::mutex> lock(mutexes[stage_id-1]);
cvs[stage_id-1].wait(lock,
[this, stage_id]() {
return !queues[stage_id-1].empty() || !running;
});
if(!running) break;
data = queues[stage_id-1].front();
queues[stage_id-1].pop();
lock.unlock();
cvs[stage_id-1].notify_one();
}
// Process data
stage_func(data);
if(stage_id < stages.size() - 1) {
// Pass to next stage
std::unique_lock<std::mutex> lock(mutexes[stage_id]);
queues[stage_id].push(data);
lock.unlock();
cvs[stage_id].notify_one();
}
}
});
}
void start() {
for(auto& stage : stages) {
stage.join();
}
}
void stop() {
running = false;
for(auto& cv : cvs) {
cv.notify_all();
}
}
};
2. N-body Simulations
Barnes-Hut Algorithm
struct Octree {
struct Node {
vec3 center;
float size;
float mass;
vec3 com;
std::vector<Node*> children;
};
Node* root;
float theta;
__device__ void compute_force(vec3& pos, vec3& force, Node* node) {
vec3 diff = node->com - pos;
float dist = length(diff);
if(node->size / dist < theta || node->children.empty()) {
// Use approximation
float f = G * node->mass / (dist * dist * dist);
force += diff * f;
} else {
// Recurse into children
for(auto child : node->children) {
if(child != nullptr) {
compute_force(pos, force, child);
}
}
}
}
__global__ void update_bodies(vec3* pos, vec3* vel, vec3* acc,
float dt, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < n) {
vec3 force(0.0f);
compute_force(pos[idx], force, root);
acc[idx] = force;
vel[idx] += acc[idx] * dt;
pos[idx] += vel[idx] * dt;
}
}
};
3. Matrix Computations
Parallel Matrix Factorization
__global__ void lu_factorization(float* A, int n, int k) {
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
if(row > k && row < n && col > k && col < n) {
A[row * n + col] -= A[row * n + k] * A[k * n + col] / A[k * n + k];
}
}
void parallel_lu(float* A, int n) {
dim3 block(16, 16);
dim3 grid((n + block.x - 1) / block.x,
(n + block.y - 1) / block.y);
for(int k = 0; k < n-1; k++) {
lu_factorization<<<grid, block>>>(A, n, k);
cudaDeviceSynchronize();
}
}
4. Big Data Processing
Parallel Data Analysis
template<typename T>
class ParallelDataProcessor {
private:
std::vector<T> data;
int num_threads;
public:
ParallelDataProcessor(const std::vector<T>& input, int threads)
: data(input), num_threads(threads) {}
template<typename Func>
std::vector<T> map(Func f) {
std::vector<T> result(data.size());
#pragma omp parallel for num_threads(num_threads)
for(size_t i = 0; i < data.size(); i++) {
result[i] = f(data[i]);
}
return result;
}
template<typename Func>
T reduce(Func f, T initial) {
T result = initial;
#pragma omp parallel num_threads(num_threads)
{
T local_sum = initial;
#pragma omp for nowait
for(size_t i = 0; i < data.size(); i++) {
local_sum = f(local_sum, data[i]);
}
#pragma omp critical
{
result = f(result, local_sum);
}
}
return result;
}
};
Lab Exercise
Tasks
- Implement Barnes-Hut simulation
- Develop parallel LU factorization
- Create big data processing pipeline
- Analyze performance characteristics
- Algorithm complexity
- Memory access patterns
- Load balancing
- Scalability testing
Resources
Documentation
- Advanced CUDA Programming Guide
- Parallel Algorithms Reference
- Scientific Computing Libraries
- Performance Profilers
- Debugging Tools
- Analysis Frameworks
Questions & Discussion