High-performance computing (HPC) on Linux platforms requires a sophisticated understanding of parallel programming paradigms, GPU architectures, and distributed computing frameworks. This comprehensive guide explores advanced HPC techniques, from CUDA and OpenCL programming to building scalable GPU clusters and optimizing computational workloads.

High-Performance Computing and GPU Programming on Linux

CUDA Programming and GPU Computing Framework

Advanced CUDA Development Environment

// cuda_framework.cu - Advanced CUDA programming framework
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <cufft.h>
#include <nvml.h>
#include <omp.h>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(error)); \
            exit(1); \
        } \
    } while(0)

#define CUBLAS_CHECK(call) \
    do { \
        cublasStatus_t status = call; \
        if (status != CUBLAS_STATUS_SUCCESS) { \
            fprintf(stderr, "cuBLAS error at %s:%d - %d\n", __FILE__, __LINE__, status); \
            exit(1); \
        } \
    } while(0)

#define MAX_GPUS 16
#define WARP_SIZE 32
#define MAX_THREADS_PER_BLOCK 1024
#define MEMORY_ALIGNMENT 256

// GPU device information structure
typedef struct {
    int device_id;
    char name[256];
    size_t total_memory;
    size_t free_memory;
    int compute_capability_major;
    int compute_capability_minor;
    int multiprocessor_count;
    int max_threads_per_multiprocessor;
    int max_threads_per_block;
    int max_shared_memory_per_block;
    int warp_size;
    bool unified_addressing;
    bool concurrent_kernels;
    int memory_bus_width;
    int memory_clock_rate;
    float memory_bandwidth_gb_s;
} gpu_device_info_t;

// CUDA context management
typedef struct {
    int num_devices;
    gpu_device_info_t devices[MAX_GPUS];
    cudaStream_t streams[MAX_GPUS];
    cublasHandle_t cublas_handles[MAX_GPUS];
    curandGenerator_t curand_generators[MAX_GPUS];
    bool initialized;
} cuda_context_t;

static cuda_context_t cuda_ctx = {0};

// Memory pool for GPU allocations
typedef struct memory_block {
    void *ptr;
    size_t size;
    bool in_use;
    int device_id;
    struct memory_block *next;
} memory_block_t;

typedef struct {
    memory_block_t *blocks;
    size_t total_allocated;
    size_t total_free;
    pthread_mutex_t mutex;
} memory_pool_t;

static memory_pool_t memory_pool = {0};

// Performance monitoring structure
typedef struct {
    double kernel_time_ms;
    double memory_transfer_time_ms;
    double total_time_ms;
    size_t bytes_transferred;
    float gpu_utilization;
    float memory_utilization;
    int sm_occupancy;
} performance_metrics_t;

// Initialize CUDA framework
int init_cuda_framework(void) {
    int device_count;
    
    printf("Initializing CUDA framework...\n");
    
    // Get device count
    CUDA_CHECK(cudaGetDeviceCount(&device_count));
    
    if (device_count == 0) {
        fprintf(stderr, "No CUDA devices found\n");
        return -1;
    }
    
    cuda_ctx.num_devices = device_count;
    
    // Initialize each device
    for (int i = 0; i < device_count; i++) {
        CUDA_CHECK(cudaSetDevice(i));
        
        gpu_device_info_t *dev = &cuda_ctx.devices[i];
        dev->device_id = i;
        
        // Get device properties
        cudaDeviceProp prop;
        CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
        
        strncpy(dev->name, prop.name, sizeof(dev->name) - 1);
        dev->total_memory = prop.totalGlobalMem;
        dev->compute_capability_major = prop.major;
        dev->compute_capability_minor = prop.minor;
        dev->multiprocessor_count = prop.multiProcessorCount;
        dev->max_threads_per_multiprocessor = prop.maxThreadsPerMultiProcessor;
        dev->max_threads_per_block = prop.maxThreadsPerBlock;
        dev->max_shared_memory_per_block = prop.sharedMemPerBlock;
        dev->warp_size = prop.warpSize;
        dev->unified_addressing = prop.unifiedAddressing;
        dev->concurrent_kernels = prop.concurrentKernels;
        dev->memory_bus_width = prop.memoryBusWidth;
        dev->memory_clock_rate = prop.memoryClockRate;
        
        // Calculate memory bandwidth
        dev->memory_bandwidth_gb_s = 2.0 * prop.memoryClockRate * 
                                    (prop.memoryBusWidth / 8) / 1.0e6;
        
        // Get current memory info
        size_t free_mem, total_mem;
        CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem));
        dev->free_memory = free_mem;
        
        // Create streams
        CUDA_CHECK(cudaStreamCreate(&cuda_ctx.streams[i]));
        
        // Create cuBLAS handle
        CUBLAS_CHECK(cublasCreate(&cuda_ctx.cublas_handles[i]));
        CUBLAS_CHECK(cublasSetStream(cuda_ctx.cublas_handles[i], cuda_ctx.streams[i]));
        
        // Create cuRAND generator
        curandCreateGenerator(&cuda_ctx.curand_generators[i], CURAND_RNG_PSEUDO_DEFAULT);
        curandSetStream(cuda_ctx.curand_generators[i], cuda_ctx.streams[i]);
        
        printf("GPU %d: %s\n", i, dev->name);
        printf("  Compute Capability: %d.%d\n", 
               dev->compute_capability_major, dev->compute_capability_minor);
        printf("  Memory: %.1f GB (%.1f GB free)\n", 
               dev->total_memory / 1e9, dev->free_memory / 1e9);
        printf("  SMs: %d, Max threads/SM: %d\n",
               dev->multiprocessor_count, dev->max_threads_per_multiprocessor);
        printf("  Memory Bandwidth: %.1f GB/s\n", dev->memory_bandwidth_gb_s);
    }
    
    // Initialize memory pool
    pthread_mutex_init(&memory_pool.mutex, NULL);
    
    cuda_ctx.initialized = true;
    printf("CUDA framework initialized with %d devices\n", device_count);
    
    return 0;
}

// Advanced memory management
void* cuda_malloc_managed(size_t size, int device_id) {
    void *ptr;
    
    pthread_mutex_lock(&memory_pool.mutex);
    
    // Try to find existing free block
    memory_block_t *block = memory_pool.blocks;
    while (block) {
        if (!block->in_use && block->size >= size && block->device_id == device_id) {
            block->in_use = true;
            pthread_mutex_unlock(&memory_pool.mutex);
            return block->ptr;
        }
        block = block->next;
    }
    
    // Allocate new block
    CUDA_CHECK(cudaSetDevice(device_id));
    CUDA_CHECK(cudaMallocManaged(&ptr, size));
    
    // Add to memory pool
    block = (memory_block_t*)malloc(sizeof(memory_block_t));
    block->ptr = ptr;
    block->size = size;
    block->in_use = true;
    block->device_id = device_id;
    block->next = memory_pool.blocks;
    memory_pool.blocks = block;
    memory_pool.total_allocated += size;
    
    pthread_mutex_unlock(&memory_pool.mutex);
    
    return ptr;
}

void cuda_free_managed(void *ptr) {
    pthread_mutex_lock(&memory_pool.mutex);
    
    memory_block_t *block = memory_pool.blocks;
    while (block) {
        if (block->ptr == ptr) {
            block->in_use = false;
            memory_pool.total_free += block->size;
            break;
        }
        block = block->next;
    }
    
    pthread_mutex_unlock(&memory_pool.mutex);
}

// CUDA kernel for matrix multiplication with optimizations
__global__ void matrix_multiply_optimized(const float *A, const float *B, float *C,
                                         int M, int N, int K, int tile_size) {
    // Shared memory for tiles
    extern __shared__ float shared_mem[];
    float *tile_A = shared_mem;
    float *tile_B = &shared_mem[tile_size * tile_size];
    
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    
    // Calculate global indices
    int row = by * tile_size + ty;
    int col = bx * tile_size + tx;
    
    float sum = 0.0f;
    
    // Loop over tiles
    for (int t = 0; t < (K + tile_size - 1) / tile_size; ++t) {
        // Load tile into shared memory
        int a_row = row;
        int a_col = t * tile_size + tx;
        int b_row = t * tile_size + ty;
        int b_col = col;
        
        if (a_row < M && a_col < K) {
            tile_A[ty * tile_size + tx] = A[a_row * K + a_col];
        } else {
            tile_A[ty * tile_size + tx] = 0.0f;
        }
        
        if (b_row < K && b_col < N) {
            tile_B[ty * tile_size + tx] = B[b_row * N + b_col];
        } else {
            tile_B[ty * tile_size + tx] = 0.0f;
        }
        
        __syncthreads();
        
        // Compute partial sum for this tile
        for (int k = 0; k < tile_size; ++k) {
            sum += tile_A[ty * tile_size + k] * tile_B[k * tile_size + tx];
        }
        
        __syncthreads();
    }
    
    // Write result
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}

// CUDA kernel for vector reduction with warp-level primitives
__global__ void vector_reduce_optimized(const float *input, float *output, int n) {
    extern __shared__ float sdata[];
    
    int tid = threadIdx.x;
    int bid = blockIdx.x;
    int grid_size = gridDim.x * blockDim.x;
    int global_tid = bid * blockDim.x + tid;
    
    // Grid-stride loop for loading
    float sum = 0.0f;
    for (int i = global_tid; i < n; i += grid_size) {
        sum += input[i];
    }
    sdata[tid] = sum;
    
    __syncthreads();
    
    // Warp-level reduction
    for (int s = blockDim.x / 2; s > 32; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    
    // Final warp reduction using shuffle (the loop above stops at s = 64, so
    // the upper half of the last 64 partial sums is folded in here first)
    if (tid < 32) {
        float warp_sum = sdata[tid];
        if (blockDim.x >= 64) {
            warp_sum += sdata[tid + 32];
        }
        for (int offset = 16; offset > 0; offset >>= 1) {
            warp_sum += __shfl_down_sync(0xffffffff, warp_sum, offset);
        }
        
        if (tid == 0) {
            output[bid] = warp_sum;
        }
    }
}
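
The reduction kernel leaves one partial sum per block, so a complete sum needs a second pass. A minimal host-side driver sketch (the helper name reduce_on_gpu and the fixed launch configuration are illustrative, not part of the framework):

// Host-side driver sketch: pass 1 reduces n elements to one partial sum per
// block on the GPU; the short pass 2 finishes on the CPU
float reduce_on_gpu(const float *d_input, int n) {
    int block_size = 256;
    int num_blocks = (n + block_size - 1) / block_size;
    if (num_blocks > 1024) num_blocks = 1024;  // the grid-stride loop covers the rest
    
    float *d_partial;
    CUDA_CHECK(cudaMalloc(&d_partial, num_blocks * sizeof(float)));
    
    vector_reduce_optimized<<<num_blocks, block_size,
                              block_size * sizeof(float)>>>(d_input, d_partial, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    
    float *h_partial = (float*)malloc(num_blocks * sizeof(float));
    CUDA_CHECK(cudaMemcpy(h_partial, d_partial, num_blocks * sizeof(float),
                          cudaMemcpyDeviceToHost));
    
    float sum = 0.0f;
    for (int i = 0; i < num_blocks; i++) {
        sum += h_partial[i];
    }
    
    free(h_partial);
    cudaFree(d_partial);
    return sum;
}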

// Element-wise complex multiplication: result[i] = a[i] * b[i]
__global__ void complex_pointwise_multiply(const cufftComplex *a, const cufftComplex *b,
                                           cufftComplex *result, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        result[i].x = a[i].x * b[i].x - a[i].y * b[i].y;
        result[i].y = a[i].x * b[i].y + a[i].y * b[i].x;
    }
}

// FFT-based convolution using cuFFT
int fft_convolution(const float *signal, const float *kernel, float *result,
                   int signal_size, int kernel_size, int device_id) {
    CUDA_CHECK(cudaSetDevice(device_id));
    
    int conv_size = signal_size + kernel_size - 1;
    int fft_size = 1;
    while (fft_size < conv_size) fft_size <<= 1;
    
    // Allocate GPU memory
    cufftComplex *d_signal, *d_kernel, *d_result;
    CUDA_CHECK(cudaMalloc(&d_signal, fft_size * sizeof(cufftComplex)));
    CUDA_CHECK(cudaMalloc(&d_kernel, fft_size * sizeof(cufftComplex)));
    CUDA_CHECK(cudaMalloc(&d_result, fft_size * sizeof(cufftComplex)));
    
    // Copy and pad data
    float *h_signal_padded = (float*)calloc(fft_size, sizeof(float));
    float *h_kernel_padded = (float*)calloc(fft_size, sizeof(float));
    
    memcpy(h_signal_padded, signal, signal_size * sizeof(float));
    memcpy(h_kernel_padded, kernel, kernel_size * sizeof(float));
    
    // Convert to complex
    cufftComplex *h_signal_complex = (cufftComplex*)malloc(fft_size * sizeof(cufftComplex));
    cufftComplex *h_kernel_complex = (cufftComplex*)malloc(fft_size * sizeof(cufftComplex));
    
    for (int i = 0; i < fft_size; i++) {
        h_signal_complex[i].x = h_signal_padded[i];
        h_signal_complex[i].y = 0.0f;
        h_kernel_complex[i].x = h_kernel_padded[i];
        h_kernel_complex[i].y = 0.0f;
    }
    
    CUDA_CHECK(cudaMemcpy(d_signal, h_signal_complex, 
                         fft_size * sizeof(cufftComplex), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_kernel, h_kernel_complex,
                         fft_size * sizeof(cufftComplex), cudaMemcpyHostToDevice));
    
    // Create FFT plans
    cufftHandle plan;
    cufftPlan1d(&plan, fft_size, CUFFT_C2C, 1);
    
    // Forward FFTs
    cufftExecC2C(plan, d_signal, d_signal, CUFFT_FORWARD);
    cufftExecC2C(plan, d_kernel, d_kernel, CUFFT_FORWARD);
    
    // Element-wise multiplication of the two spectra into d_result
    dim3 block(256);
    dim3 grid((fft_size + block.x - 1) / block.x);
    
    complex_pointwise_multiply<<<grid, block>>>(d_signal, d_kernel, d_result, fft_size);
    CUDA_CHECK(cudaGetLastError());
    
    // Inverse FFT
    cufftExecC2C(plan, d_result, d_result, CUFFT_INVERSE);
    
    // Copy result back
    cufftComplex *h_result_complex = (cufftComplex*)malloc(fft_size * sizeof(cufftComplex));
    CUDA_CHECK(cudaMemcpy(h_result_complex, d_result,
                         fft_size * sizeof(cufftComplex), cudaMemcpyDeviceToHost));
    
    // Extract real part and normalize
    for (int i = 0; i < conv_size; i++) {
        result[i] = h_result_complex[i].x / fft_size;
    }
    
    // Cleanup
    cufftDestroy(plan);
    cudaFree(d_signal);
    cudaFree(d_kernel);
    cudaFree(d_result);
    free(h_signal_padded);
    free(h_kernel_padded);
    free(h_signal_complex);
    free(h_kernel_complex);
    free(h_result_complex);
    
    return 0;
}
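
A short usage sketch for fft_convolution, with illustrative values: an 8-sample signal convolved with a 3-tap box filter on device 0 produces 8 + 3 - 1 = 10 output samples.

// Usage sketch (illustrative values and helper name)
void fft_convolution_example(void) {
    float signal[8] = {1, 2, 3, 4, 4, 3, 2, 1};
    float filter[3] = {1.0f/3, 1.0f/3, 1.0f/3};
    float result[10];
    
    fft_convolution(signal, filter, result, 8, 3, 0);
    
    for (int i = 0; i < 10; i++) {
        printf("result[%d] = %.3f\n", i, result[i]);
    }
}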

// Multi-GPU matrix multiplication
int multi_gpu_matrix_multiply(const float *A, const float *B, float *C,
                             int M, int N, int K) {
    int num_gpus = cuda_ctx.num_devices;
    
    // Distribute work across GPUs
    int rows_per_gpu = M / num_gpus;
    int remainder = M % num_gpus;
    
    // Allocate device memory on each GPU
    float **d_A = (float**)malloc(num_gpus * sizeof(float*));
    float **d_B = (float**)malloc(num_gpus * sizeof(float*));
    float **d_C = (float**)malloc(num_gpus * sizeof(float*));
    
    cudaEvent_t *start_events = (cudaEvent_t*)malloc(num_gpus * sizeof(cudaEvent_t));
    cudaEvent_t *stop_events = (cudaEvent_t*)malloc(num_gpus * sizeof(cudaEvent_t));
    
    for (int gpu = 0; gpu < num_gpus; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        
        int gpu_rows = rows_per_gpu + (gpu < remainder ? 1 : 0);
        
        CUDA_CHECK(cudaMalloc(&d_A[gpu], gpu_rows * K * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_B[gpu], K * N * sizeof(float)));
        CUDA_CHECK(cudaMalloc(&d_C[gpu], gpu_rows * N * sizeof(float)));
        
        cudaEventCreate(&start_events[gpu]);
        cudaEventCreate(&stop_events[gpu]);
    }
    
    // Launch kernels on each GPU
    #pragma omp parallel for
    for (int gpu = 0; gpu < num_gpus; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        
        int start_row = gpu * rows_per_gpu + (gpu < remainder ? gpu : remainder);
        int gpu_rows = rows_per_gpu + (gpu < remainder ? 1 : 0);
        
        // Copy data to GPU
        CUDA_CHECK(cudaMemcpyAsync(d_A[gpu], &A[start_row * K],
                                  gpu_rows * K * sizeof(float),
                                  cudaMemcpyHostToDevice, cuda_ctx.streams[gpu]));
        CUDA_CHECK(cudaMemcpyAsync(d_B[gpu], B, K * N * sizeof(float),
                                  cudaMemcpyHostToDevice, cuda_ctx.streams[gpu]));
        
        // Launch kernel
        cudaEventRecord(start_events[gpu], cuda_ctx.streams[gpu]);
        
        int tile_size = 16;
        dim3 block(tile_size, tile_size);
        dim3 grid((N + tile_size - 1) / tile_size, 
                  (gpu_rows + tile_size - 1) / tile_size);
        
        size_t shared_mem = 2 * tile_size * tile_size * sizeof(float);
        
        matrix_multiply_optimized<<<grid, block, shared_mem, cuda_ctx.streams[gpu]>>>(
            d_A[gpu], d_B[gpu], d_C[gpu], gpu_rows, N, K, tile_size);
        
        cudaEventRecord(stop_events[gpu], cuda_ctx.streams[gpu]);
        
        // Copy result back
        CUDA_CHECK(cudaMemcpyAsync(&C[start_row * N], d_C[gpu],
                                  gpu_rows * N * sizeof(float),
                                  cudaMemcpyDeviceToHost, cuda_ctx.streams[gpu]));
    }
    
    // Wait for all GPUs to complete
    for (int gpu = 0; gpu < num_gpus; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        CUDA_CHECK(cudaStreamSynchronize(cuda_ctx.streams[gpu]));
        
        float gpu_time;
        cudaEventElapsedTime(&gpu_time, start_events[gpu], stop_events[gpu]);
        printf("GPU %d computation time: %.2f ms\n", gpu, gpu_time);
    }
    
    // Cleanup
    for (int gpu = 0; gpu < num_gpus; gpu++) {
        CUDA_CHECK(cudaSetDevice(gpu));
        cudaFree(d_A[gpu]);
        cudaFree(d_B[gpu]);
        cudaFree(d_C[gpu]);
        cudaEventDestroy(start_events[gpu]);
        cudaEventDestroy(stop_events[gpu]);
    }
    
    free(d_A);
    free(d_B);
    free(d_C);
    free(start_events);
    free(stop_events);
    
    return 0;
}

// GPU performance monitoring
performance_metrics_t measure_gpu_performance(int device_id, 
                                             void (*kernel_func)(void*), 
                                             void *kernel_args) {
    performance_metrics_t metrics = {0};
    
    CUDA_CHECK(cudaSetDevice(device_id));
    
    // Create events for timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    
    // Measure kernel execution time
    cudaEventRecord(start);
    kernel_func(kernel_args);
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    
    float kernel_time;
    cudaEventElapsedTime(&kernel_time, start, stop);
    metrics.kernel_time_ms = kernel_time;
    
    // Get GPU utilization using NVML
    nvmlDevice_t nvml_device;
    nvmlInit();
    nvmlDeviceGetHandleByIndex(device_id, &nvml_device);
    
    nvmlUtilization_t utilization;
    nvmlDeviceGetUtilizationRates(nvml_device, &utilization);
    metrics.gpu_utilization = utilization.gpu;
    metrics.memory_utilization = utilization.memory;
    
    nvmlShutdown();
    
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    
    return metrics;
}
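
Because measure_gpu_performance takes a plain void (*)(void*) callback, kernel launches are driven through a small wrapper. A sketch of one possible wrapper; reduce_args_t and launch_reduce are hypothetical helpers introduced here for illustration:

// Hypothetical argument struct and launcher for measure_gpu_performance
typedef struct {
    const float *d_in;
    float *d_out;   // one partial sum per block
    int n;
} reduce_args_t;

static void launch_reduce(void *p) {
    reduce_args_t *args = (reduce_args_t*)p;
    int block = 256;
    int grid = (args->n + block - 1) / block;
    vector_reduce_optimized<<<grid, block, block * sizeof(float)>>>(
        args->d_in, args->d_out, args->n);
}

/* Typical call:
 *   reduce_args_t args = { d_input, d_partial, n };
 *   performance_metrics_t m = measure_gpu_performance(0, launch_reduce, &args);
 *   printf("kernel: %.2f ms, GPU util: %.0f%%\n",
 *          m.kernel_time_ms, m.gpu_utilization);
 */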

// Optimize kernel launch parameters
void optimize_kernel_config(int device_id, void *kernel_func, 
                           int *optimal_block_size, int *optimal_grid_size,
                           size_t dynamic_shared_mem) {
    CUDA_CHECK(cudaSetDevice(device_id));
    
    int min_grid_size, block_size;
    
    // Use CUDA occupancy API
    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size,
                                      (const void*)kernel_func,
                                      dynamic_shared_mem, 0);
    
    *optimal_block_size = block_size;
    *optimal_grid_size = min_grid_size;
    
    printf("Optimal configuration for GPU %d:\n", device_id);
    printf("  Block size: %d\n", block_size);
    printf("  Min grid size: %d\n", min_grid_size);
    
    // Calculate theoretical occupancy
    int max_active_blocks;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks,
                                                 (const void*)kernel_func,
                                                 block_size, dynamic_shared_mem);
    
    float occupancy = (float)max_active_blocks * block_size / 
                     cuda_ctx.devices[device_id].max_threads_per_multiprocessor;
    
    printf("  Theoretical occupancy: %.1f%%\n", occupancy * 100);
}
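
A short call sketch for the occupancy helper, using the tiled matrix-multiply kernel above with its two 16x16 float tiles of dynamic shared memory (occupancy_example is an illustrative name):

// Call sketch: query an occupancy-optimal configuration for the tiled kernel
void occupancy_example(void) {
    int block_size, grid_size;
    size_t shared_mem = 2 * 16 * 16 * sizeof(float);
    
    optimize_kernel_config(0, (void*)matrix_multiply_optimized,
                           &block_size, &grid_size, shared_mem);
    
    // The kernel expects a 2D block of tile_size x tile_size threads, so the
    // returned block_size is a per-block thread budget to map onto a dim3,
    // not a drop-in launch parameter
}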

OpenCL Cross-Platform Computing Framework

Comprehensive OpenCL Development Environment

// opencl_framework.c - Advanced OpenCL programming framework
// Note: the kernel source below uses a C++11 raw string literal, so this
// file is compiled as C++ (see the build script)
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <pthread.h>
#include <time.h>

#define CL_CHECK(err) \
    do { \
        if (err != CL_SUCCESS) { \
            fprintf(stderr, "OpenCL error %d at %s:%d\n", err, __FILE__, __LINE__); \
            exit(1); \
        } \
    } while(0)

#define MAX_PLATFORMS 8
#define MAX_DEVICES 32
#define MAX_KERNELS 128

typedef struct {
    cl_platform_id platform_id;
    char name[256];
    char vendor[256];
    char version[256];
    char extensions[2048];
} opencl_platform_info_t;

typedef struct {
    cl_device_id device_id;
    cl_device_type type;
    char name[256];
    char vendor[256];
    cl_uint compute_units;
    size_t max_work_group_size;
    cl_uint max_work_item_dimensions;
    size_t *max_work_item_sizes;
    size_t global_mem_size;
    size_t local_mem_size;
    size_t max_constant_buffer_size;
    cl_bool unified_memory;
    cl_uint preferred_vector_width_float;
    cl_uint native_vector_width_float;
    char extensions[2048];
} opencl_device_info_t;

typedef struct {
    cl_context context;
    cl_command_queue queue;
    cl_program program;
    cl_kernel kernels[MAX_KERNELS];
    int num_kernels;
    opencl_device_info_t device_info;
} opencl_context_t;

typedef struct {
    int num_platforms;
    opencl_platform_info_t platforms[MAX_PLATFORMS];
    int num_devices;
    opencl_device_info_t devices[MAX_DEVICES];
    opencl_context_t contexts[MAX_DEVICES];
    bool initialized;
} opencl_framework_t;

static opencl_framework_t ocl_fw = {0};

// Initialize OpenCL framework
int init_opencl_framework(void) {
    cl_int err;
    cl_uint num_platforms;
    
    printf("Initializing OpenCL framework...\n");
    
    // Query the number of available platforms
    err = clGetPlatformIDs(0, NULL, &num_platforms);
    CL_CHECK(err);
    
    if (num_platforms == 0) {
        fprintf(stderr, "No OpenCL platforms found\n");
        return -1;
    }
    
    cl_platform_id platforms[MAX_PLATFORMS];
    err = clGetPlatformIDs(num_platforms, platforms, NULL);
    CL_CHECK(err);
    
    ocl_fw.num_platforms = num_platforms;
    
    // Get platform information
    for (int i = 0; i < num_platforms; i++) {
        opencl_platform_info_t *platform = &ocl_fw.platforms[i];
        platform->platform_id = platforms[i];
        
        clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 
                         sizeof(platform->name), platform->name, NULL);
        clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR,
                         sizeof(platform->vendor), platform->vendor, NULL);
        clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION,
                         sizeof(platform->version), platform->version, NULL);
        clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS,
                         sizeof(platform->extensions), platform->extensions, NULL);
        
        printf("Platform %d: %s (%s)\n", i, platform->name, platform->vendor);
        
        // Get devices for this platform
        cl_uint platform_devices;
        err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &platform_devices);
        if (err == CL_SUCCESS && platform_devices > 0) {
            cl_device_id *device_ids = (cl_device_id*)malloc(platform_devices * sizeof(cl_device_id));
            err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 
                               platform_devices, device_ids, NULL);
            CL_CHECK(err);
            
            for (int j = 0; j < platform_devices && ocl_fw.num_devices < MAX_DEVICES; j++) {
                opencl_device_info_t *device = &ocl_fw.devices[ocl_fw.num_devices];
                device->device_id = device_ids[j];
                
                // Get device information
                clGetDeviceInfo(device_ids[j], CL_DEVICE_TYPE,
                               sizeof(device->type), &device->type, NULL);
                clGetDeviceInfo(device_ids[j], CL_DEVICE_NAME,
                               sizeof(device->name), device->name, NULL);
                clGetDeviceInfo(device_ids[j], CL_DEVICE_VENDOR,
                               sizeof(device->vendor), device->vendor, NULL);
                clGetDeviceInfo(device_ids[j], CL_DEVICE_MAX_COMPUTE_UNITS,
                               sizeof(device->compute_units), &device->compute_units, NULL);
                clGetDeviceInfo(device_ids[j], CL_DEVICE_MAX_WORK_GROUP_SIZE,
                               sizeof(device->max_work_group_size), &device->max_work_group_size, NULL);
                clGetDeviceInfo(device_ids[j], CL_DEVICE_GLOBAL_MEM_SIZE,
                               sizeof(device->global_mem_size), &device->global_mem_size, NULL);
                clGetDeviceInfo(device_ids[j], CL_DEVICE_LOCAL_MEM_SIZE,
                               sizeof(device->local_mem_size), &device->local_mem_size, NULL);
                clGetDeviceInfo(device_ids[j], CL_DEVICE_EXTENSIONS,
                               sizeof(device->extensions), device->extensions, NULL);
                
                const char *device_type_str = "Unknown";
                if (device->type & CL_DEVICE_TYPE_CPU) device_type_str = "CPU";
                else if (device->type & CL_DEVICE_TYPE_GPU) device_type_str = "GPU";
                else if (device->type & CL_DEVICE_TYPE_ACCELERATOR) device_type_str = "Accelerator";
                
                printf("  Device %d: %s (%s)\n", ocl_fw.num_devices, device->name, device_type_str);
                printf("    Compute Units: %u\n", device->compute_units);
                printf("    Global Memory: %.1f MB\n", device->global_mem_size / 1e6);
                printf("    Local Memory: %.1f KB\n", device->local_mem_size / 1e3);
                printf("    Max Work Group Size: %zu\n", device->max_work_group_size);
                
                ocl_fw.num_devices++;
            }
            
            free(device_ids);
        }
    }
    
    ocl_fw.initialized = true;
    printf("OpenCL framework initialized with %d platforms and %d devices\n",
           ocl_fw.num_platforms, ocl_fw.num_devices);
    
    return 0;
}

// Create OpenCL context for specific device
int create_opencl_context(int device_index) {
    if (device_index >= ocl_fw.num_devices) {
        fprintf(stderr, "Invalid device index: %d\n", device_index);
        return -1;
    }
    
    opencl_context_t *ctx = &ocl_fw.contexts[device_index];
    opencl_device_info_t *device = &ocl_fw.devices[device_index];
    cl_int err;
    
    // Create context
    ctx->context = clCreateContext(NULL, 1, &device->device_id, NULL, NULL, &err);
    CL_CHECK(err);
    
    // Create command queue with profiling enabled, so that
    // clGetEventProfilingInfo can be used on kernel events later
    cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    ctx->queue = clCreateCommandQueueWithProperties(ctx->context, device->device_id,
                                                   props, &err);
    CL_CHECK(err);
    
    // Copy device info
    memcpy(&ctx->device_info, device, sizeof(opencl_device_info_t));
    
    printf("Created OpenCL context for device: %s\n", device->name);
    return 0;
}

// Advanced OpenCL kernels
const char *matrix_multiply_kernel = R"(
__kernel void matrix_multiply_tiled(__global const float* A,
                                   __global const float* B,
                                   __global float* C,
                                   const int M, const int N, const int K,
                                   const int tile_size) {
    __local float tile_A[16][16];
    __local float tile_B[16][16];
    
    int bx = get_group_id(0);
    int by = get_group_id(1);
    int tx = get_local_id(0);
    int ty = get_local_id(1);
    
    int row = by * tile_size + ty;
    int col = bx * tile_size + tx;
    
    float sum = 0.0f;
    
    for (int t = 0; t < (K + tile_size - 1) / tile_size; t++) {
        // Load tiles into local memory
        int a_row = row;
        int a_col = t * tile_size + tx;
        int b_row = t * tile_size + ty;
        int b_col = col;
        
        if (a_row < M && a_col < K) {
            tile_A[ty][tx] = A[a_row * K + a_col];
        } else {
            tile_A[ty][tx] = 0.0f;
        }
        
        if (b_row < K && b_col < N) {
            tile_B[ty][tx] = B[b_row * N + b_col];
        } else {
            tile_B[ty][tx] = 0.0f;
        }
        
        barrier(CLK_LOCAL_MEM_FENCE);
        
        // Compute partial sum
        for (int k = 0; k < tile_size; k++) {
            sum += tile_A[ty][k] * tile_B[k][tx];
        }
        
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    
    // Write result
    if (row < M && col < N) {
        C[row * N + col] = sum;
    }
}

__kernel void vector_add_optimized(__global const float* a,
                                  __global const float* b,
                                  __global float* c,
                                  const int n) {
    int gid = get_global_id(0);
    int grid_size = get_global_size(0);
    
    // Grid-stride loop for better memory access
    for (int i = gid; i < n; i += grid_size) {
        c[i] = a[i] + b[i];
    }
}

__kernel void reduction_optimized(__global const float* input,
                                 __global float* output,
                                 __local float* local_mem,
                                 const int n) {
    int gid = get_global_id(0);
    int lid = get_local_id(0);
    int group_size = get_local_size(0);
    int group_id = get_group_id(0);
    
    // Load data into local memory
    float sum = 0.0f;
    for (int i = gid; i < n; i += get_global_size(0)) {
        sum += input[i];
    }
    local_mem[lid] = sum;
    
    barrier(CLK_LOCAL_MEM_FENCE);
    
    // Reduce within work group
    for (int stride = group_size / 2; stride > 0; stride /= 2) {
        if (lid < stride) {
            local_mem[lid] += local_mem[lid + stride];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    
    // Write group result
    if (lid == 0) {
        output[group_id] = local_mem[0];
    }
}

__kernel void fft_radix2(__global float2* data,
                        const int n,
                        const int stage,
                        const int direction) {
    int gid = get_global_id(0);
    int pairs = n >> stage;
    int pair_id = gid / (pairs / 2);
    int element_id = gid % (pairs / 2);
    
    if (gid >= pairs / 2) return;
    
    int step = 1 << (stage - 1);
    int idx1 = pair_id * step * 2 + element_id;
    int idx2 = idx1 + step;
    
    float angle = -2.0f * M_PI_F * element_id / (2 * step) * direction;
    float2 twiddle = (float2)(cos(angle), sin(angle));
    
    float2 a = data[idx1];
    float2 b = data[idx2];
    
    // Complex multiplication: b * twiddle
    float2 b_twiddle;
    b_twiddle.x = b.x * twiddle.x - b.y * twiddle.y;
    b_twiddle.y = b.x * twiddle.y + b.y * twiddle.x;
    
    data[idx1] = a + b_twiddle;
    data[idx2] = a - b_twiddle;
}
)";

// Compile and build OpenCL program
int build_opencl_program(int device_index, const char *source_code) {
    opencl_context_t *ctx = &ocl_fw.contexts[device_index];
    cl_int err;
    
    // Create program from source
    ctx->program = clCreateProgramWithSource(ctx->context, 1, &source_code, NULL, &err);
    CL_CHECK(err);
    
    // Build program
    err = clBuildProgram(ctx->program, 1, &ctx->device_info.device_id, 
                        "-cl-fast-relaxed-math -cl-mad-enable", NULL, NULL);
    
    if (err != CL_SUCCESS) {
        size_t log_size;
        clGetProgramBuildInfo(ctx->program, ctx->device_info.device_id,
                             CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
        
        char *log = (char*)malloc(log_size);
        clGetProgramBuildInfo(ctx->program, ctx->device_info.device_id,
                             CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
        
        fprintf(stderr, "Build error:\n%s\n", log);
        free(log);
        return -1;
    }
    
    printf("OpenCL program built successfully for device: %s\n", 
           ctx->device_info.name);
    
    return 0;
}

// Create kernel from built program
cl_kernel create_opencl_kernel(int device_index, const char *kernel_name) {
    opencl_context_t *ctx = &ocl_fw.contexts[device_index];
    cl_int err;
    
    cl_kernel kernel = clCreateKernel(ctx->program, kernel_name, &err);
    CL_CHECK(err);
    
    if (ctx->num_kernels < MAX_KERNELS) {
        ctx->kernels[ctx->num_kernels++] = kernel;
    }
    
    return kernel;
}

// Execute matrix multiplication using OpenCL
int opencl_matrix_multiply(int device_index, const float *A, const float *B, float *C,
                          int M, int N, int K) {
    opencl_context_t *ctx = &ocl_fw.contexts[device_index];
    cl_int err;
    
    // Create buffers
    cl_mem buf_A = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 M * K * sizeof(float), (void*)A, &err);
    CL_CHECK(err);
    
    cl_mem buf_B = clCreateBuffer(ctx->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                 K * N * sizeof(float), (void*)B, &err);
    CL_CHECK(err);
    
    cl_mem buf_C = clCreateBuffer(ctx->context, CL_MEM_WRITE_ONLY,
                                 M * N * sizeof(float), NULL, &err);
    CL_CHECK(err);
    
    // Create kernel
    cl_kernel kernel = create_opencl_kernel(device_index, "matrix_multiply_tiled");
    
    // Set kernel arguments
    int tile_size = 16;
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_A);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_B);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_C);
    clSetKernelArg(kernel, 3, sizeof(int), &M);
    clSetKernelArg(kernel, 4, sizeof(int), &N);
    clSetKernelArg(kernel, 5, sizeof(int), &K);
    clSetKernelArg(kernel, 6, sizeof(int), &tile_size);
    
    // Execute kernel
    size_t global_work_size[2] = {(size_t)((N + tile_size - 1) / tile_size * tile_size),
                                  (size_t)((M + tile_size - 1) / tile_size * tile_size)};
    size_t local_work_size[2] = {(size_t)tile_size, (size_t)tile_size};
    
    cl_event event;
    err = clEnqueueNDRangeKernel(ctx->queue, kernel, 2, NULL,
                                global_work_size, local_work_size, 0, NULL, &event);
    CL_CHECK(err);
    
    // Read result
    err = clEnqueueReadBuffer(ctx->queue, buf_C, CL_TRUE, 0,
                             M * N * sizeof(float), C, 1, &event, NULL);
    CL_CHECK(err);
    
    // Get execution time
    clWaitForEvents(1, &event);
    cl_ulong start_time, end_time;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                           sizeof(start_time), &start_time, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                           sizeof(end_time), &end_time, NULL);
    
    double execution_time = (end_time - start_time) / 1e6; // Convert to ms
    printf("Matrix multiplication execution time: %.2f ms\n", execution_time);
    
    // Cleanup (the kernel stays registered in ctx->kernels and is released
    // by cleanup_opencl_framework below)
    clReleaseMemObject(buf_A);
    clReleaseMemObject(buf_B);
    clReleaseMemObject(buf_C);
    clReleaseEvent(event);
    
    return 0;
}
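
The reduction_optimized kernel declares its scratch space as a __local parameter; on the host that buffer is sized by passing NULL with the desired byte count to clSetKernelArg. A minimal sketch under that assumption (opencl_sum is an added helper with a fixed work-group count, finished on the CPU):

// Host-side sketch for reduction_optimized: argument 2 is the per-work-group
// __local buffer, allocated by passing NULL with the desired size
float opencl_sum(int device_index, cl_mem buf_input, int n) {
    opencl_context_t *ctx = &ocl_fw.contexts[device_index];
    cl_int err;
    
    size_t local_size = 256;
    size_t num_groups = 64;                      // grid-stride loop covers all n
    size_t global_size = local_size * num_groups;
    
    cl_mem buf_partial = clCreateBuffer(ctx->context, CL_MEM_WRITE_ONLY,
                                        num_groups * sizeof(float), NULL, &err);
    CL_CHECK(err);
    
    cl_kernel kernel = create_opencl_kernel(device_index, "reduction_optimized");
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_partial);
    clSetKernelArg(kernel, 2, local_size * sizeof(float), NULL);  // __local buffer
    clSetKernelArg(kernel, 3, sizeof(int), &n);
    
    err = clEnqueueNDRangeKernel(ctx->queue, kernel, 1, NULL,
                                 &global_size, &local_size, 0, NULL, NULL);
    CL_CHECK(err);
    
    // Finish the per-group partial sums on the host
    float partial[64];
    err = clEnqueueReadBuffer(ctx->queue, buf_partial, CL_TRUE, 0,
                              sizeof(partial), partial, 0, NULL, NULL);
    CL_CHECK(err);
    
    float sum = 0.0f;
    for (size_t i = 0; i < num_groups; i++) {
        sum += partial[i];
    }
    
    clReleaseMemObject(buf_partial);
    // the kernel stays registered in ctx->kernels for the cleanup below
    return sum;
}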

// Performance benchmarking
void benchmark_opencl_device(int device_index) {
    printf("\n=== Benchmarking Device: %s ===\n", 
           ocl_fw.devices[device_index].name);
    
    // Matrix multiplication benchmark
    int sizes[] = {512, 1024, 2048};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
    
    for (int i = 0; i < num_sizes; i++) {
        int size = sizes[i];
        printf("\nMatrix size: %dx%d\n", size, size);
        
        // Allocate matrices
        float *A = (float*)malloc(size * size * sizeof(float));
        float *B = (float*)malloc(size * size * sizeof(float));
        float *C = (float*)malloc(size * size * sizeof(float));
        
        // Initialize with random data
        for (int j = 0; j < size * size; j++) {
            A[j] = (float)rand() / RAND_MAX;
            B[j] = (float)rand() / RAND_MAX;
        }
        
        // Benchmark
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);
        
        opencl_matrix_multiply(device_index, A, B, C, size, size, size);
        
        clock_gettime(CLOCK_MONOTONIC, &end);
        
        double total_time = (end.tv_sec - start.tv_sec) + 
                           (end.tv_nsec - start.tv_nsec) / 1e9;
        
        double gflops = (2.0 * size * size * size) / (total_time * 1e9);
        
        printf("Total time: %.3f seconds\n", total_time);
        printf("Performance: %.1f GFLOPS\n", gflops);
        
        free(A);
        free(B);
        free(C);
    }
}
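
The framework above never releases its OpenCL objects, so a cleanup routine is sketched here to match its data structures (cleanup_opencl_framework is an added helper, not part of the original code):

// Cleanup sketch: release kernels, program, queue and context per device
void cleanup_opencl_framework(void) {
    for (int i = 0; i < ocl_fw.num_devices; i++) {
        opencl_context_t *ctx = &ocl_fw.contexts[i];
        if (!ctx->context) {
            continue;  // no context was created for this device
        }
        
        for (int k = 0; k < ctx->num_kernels; k++) {
            clReleaseKernel(ctx->kernels[k]);
        }
        if (ctx->program) {
            clReleaseProgram(ctx->program);
        }
        clReleaseCommandQueue(ctx->queue);
        clReleaseContext(ctx->context);
    }
    ocl_fw.initialized = false;
}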

MPI and Distributed Computing Integration

Advanced MPI Framework for GPU Clusters

// mpi_gpu_framework.cu - MPI framework for distributed GPU computing
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <unistd.h>
#include <math.h>
#include <omp.h>
#include <cuda_runtime.h>
#include <nccl.h>
#include <time.h>

#define MPI_CHECK(call) \
    do { \
        int err = call; \
        if (err != MPI_SUCCESS) { \
            char error_string[MPI_MAX_ERROR_STRING]; \
            int length; \
            MPI_Error_string(err, error_string, &length); \
            fprintf(stderr, "MPI error at %s:%d - %s\n", __FILE__, __LINE__, error_string); \
            exit(1); \
        } \
    } while(0)

#define NCCL_CHECK(call) \
    do { \
        ncclResult_t result = call; \
        if (result != ncclSuccess) { \
            fprintf(stderr, "NCCL error at %s:%d - %s\n", __FILE__, __LINE__, \
                    ncclGetErrorString(result)); \
            exit(1); \
        } \
    } while(0)

#define CUDA_CHECK(call) \
    do { \
        cudaError_t error = call; \
        if (error != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d - %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(error)); \
            exit(1); \
        } \
    } while(0)

// Tiled matrix-multiply kernel defined in cuda_framework.cu; both
// translation units are compiled together with -rdc=true (see build script)
__global__ void matrix_multiply_optimized(const float *A, const float *B, float *C,
                                          int M, int N, int K, int tile_size);

typedef struct {
    int rank;
    int size;
    int local_rank;
    int local_size;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    int num_gpus;
    int *gpu_ids;
    cudaStream_t *streams;
    ncclComm_t nccl_comm;
    bool nccl_initialized;
} mpi_gpu_context_t;

static mpi_gpu_context_t mpi_ctx = {0};

// Initialize MPI and GPU environment
int init_mpi_gpu_framework(int argc, char **argv) {
    int provided;
    
    // Initialize MPI with thread support
    MPI_CHECK(MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided));
    
    if (provided < MPI_THREAD_MULTIPLE) {
        fprintf(stderr, "Warning: MPI does not provide full thread support\n");
    }
    
    // Get MPI rank and size
    MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &mpi_ctx.rank));
    MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &mpi_ctx.size));
    
    // Get processor name
    int name_len;
    MPI_CHECK(MPI_Get_processor_name(mpi_ctx.hostname, &name_len));
    
    // Determine local rank and size
    MPI_Comm local_comm;
    MPI_CHECK(MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED,
                                  mpi_ctx.rank, MPI_INFO_NULL, &local_comm));
    MPI_CHECK(MPI_Comm_rank(local_comm, &mpi_ctx.local_rank));
    MPI_CHECK(MPI_Comm_size(local_comm, &mpi_ctx.local_size));
    
    // Initialize CUDA and get GPU count
    CUDA_CHECK(cudaGetDeviceCount(&mpi_ctx.num_gpus));
    
    if (mpi_ctx.num_gpus == 0) {
        fprintf(stderr, "No CUDA devices found on rank %d\n", mpi_ctx.rank);
        return -1;
    }
    
    // Assign GPUs to local ranks
    mpi_ctx.gpu_ids = (int*)malloc(mpi_ctx.num_gpus * sizeof(int));
    mpi_ctx.streams = (cudaStream_t*)malloc(mpi_ctx.num_gpus * sizeof(cudaStream_t));
    
    for (int i = 0; i < mpi_ctx.num_gpus; i++) {
        mpi_ctx.gpu_ids[i] = (mpi_ctx.local_rank + i) % mpi_ctx.num_gpus;
        CUDA_CHECK(cudaSetDevice(mpi_ctx.gpu_ids[i]));
        CUDA_CHECK(cudaStreamCreate(&mpi_ctx.streams[i]));
    }
    
    // Set primary GPU for this rank
    CUDA_CHECK(cudaSetDevice(mpi_ctx.gpu_ids[0]));
    
    printf("Rank %d/%d (%s): Local rank %d/%d, GPU %d\n",
           mpi_ctx.rank, mpi_ctx.size, mpi_ctx.hostname,
           mpi_ctx.local_rank, mpi_ctx.local_size, mpi_ctx.gpu_ids[0]);
    
    MPI_Comm_free(&local_comm);
    
    return 0;
}

// Initialize NCCL for GPU communication
int init_nccl_communication(void) {
    ncclUniqueId nccl_id;
    
    // Generate NCCL unique ID on rank 0
    if (mpi_ctx.rank == 0) {
        NCCL_CHECK(ncclGetUniqueId(&nccl_id));
    }
    
    // Broadcast NCCL ID to all ranks
    MPI_CHECK(MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD));
    
    // Initialize NCCL communicator
    NCCL_CHECK(ncclCommInitRank(&mpi_ctx.nccl_comm, mpi_ctx.size, nccl_id, mpi_ctx.rank));
    
    mpi_ctx.nccl_initialized = true;
    
    printf("NCCL initialized on rank %d\n", mpi_ctx.rank);
    return 0;
}

// Distributed matrix multiplication using MPI+CUDA
int distributed_matrix_multiply(const float *A, const float *B, float *C,
                               int M, int N, int K) {
    // Calculate data distribution
    int rows_per_rank = M / mpi_ctx.size;
    int remainder = M % mpi_ctx.size;
    int my_rows = rows_per_rank + (mpi_ctx.rank < remainder ? 1 : 0);
    int my_start_row = mpi_ctx.rank * rows_per_rank + 
                       (mpi_ctx.rank < remainder ? mpi_ctx.rank : remainder);
    
    // Allocate GPU memory
    float *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc(&d_A, my_rows * K * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_B, K * N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_C, my_rows * N * sizeof(float)));
    
    // Copy data to GPU
    CUDA_CHECK(cudaMemcpy(d_A, &A[my_start_row * K], 
                         my_rows * K * sizeof(float), cudaMemcpyHostToDevice));
    
    // Broadcast matrix B to all ranks
    if (mpi_ctx.rank == 0) {
        CUDA_CHECK(cudaMemcpy(d_B, B, K * N * sizeof(float), cudaMemcpyHostToDevice));
    }
    
    // Use NCCL to broadcast B across all GPUs
    if (mpi_ctx.nccl_initialized) {
        NCCL_CHECK(ncclBcast(d_B, K * N, ncclFloat, 0, mpi_ctx.nccl_comm, mpi_ctx.streams[0]));
        CUDA_CHECK(cudaStreamSynchronize(mpi_ctx.streams[0]));
    } else {
        // Fallback to MPI broadcast
        float *h_B = (float*)malloc(K * N * sizeof(float));
        if (mpi_ctx.rank == 0) {
            memcpy(h_B, B, K * N * sizeof(float));
        }
        MPI_CHECK(MPI_Bcast(h_B, K * N, MPI_FLOAT, 0, MPI_COMM_WORLD));
        CUDA_CHECK(cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice));
        free(h_B);
    }
    
    // Launch matrix multiplication kernel
    int tile_size = 16;
    dim3 block(tile_size, tile_size);
    dim3 grid((N + tile_size - 1) / tile_size, 
              (my_rows + tile_size - 1) / tile_size);
    
    // Use the optimized kernel from CUDA framework
    size_t shared_mem = 2 * tile_size * tile_size * sizeof(float);
    
    // Record start time
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);
    
    matrix_multiply_optimized<<<grid, block, shared_mem>>>(d_A, d_B, d_C, my_rows, N, K, tile_size);
    
    cudaEventRecord(stop);
    CUDA_CHECK(cudaDeviceSynchronize());
    
    // Get execution time
    float gpu_time;
    cudaEventElapsedTime(&gpu_time, start, stop);
    
    // Copy result back to host
    CUDA_CHECK(cudaMemcpy(&C[my_start_row * N], d_C, 
                         my_rows * N * sizeof(float), cudaMemcpyDeviceToHost));
    
    // Gather all results
    int *recvcounts = (int*)malloc(mpi_ctx.size * sizeof(int));
    int *displs = (int*)malloc(mpi_ctx.size * sizeof(int));
    
    for (int i = 0; i < mpi_ctx.size; i++) {
        int rank_rows = rows_per_rank + (i < remainder ? 1 : 0);
        recvcounts[i] = rank_rows * N;
        displs[i] = (i * rows_per_rank + (i < remainder ? i : remainder)) * N;
    }
    
    // MPI_IN_PLACE: each rank's rows already sit in its slice of C, and MPI
    // forbids overlapping send and receive buffers otherwise
    MPI_CHECK(MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                            C, recvcounts, displs, MPI_FLOAT, MPI_COMM_WORLD));
    
    // Calculate performance metrics
    double total_gflops = (2.0 * M * N * K) / (gpu_time / 1000.0) / 1e9;
    
    if (mpi_ctx.rank == 0) {
        printf("Distributed matrix multiplication completed\n");
        printf("GPU computation time: %.2f ms\n", gpu_time);
        printf("Total performance: %.1f GFLOPS\n", total_gflops);
    }
    
    // Cleanup
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(recvcounts);
    free(displs);
    
    return 0;
}

// All-reduce operation using NCCL
int gpu_allreduce(float *data, size_t count) {
    if (!mpi_ctx.nccl_initialized) {
        fprintf(stderr, "NCCL not initialized\n");
        return -1;
    }
    
    float *d_data;
    CUDA_CHECK(cudaMalloc(&d_data, count * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_data, data, count * sizeof(float), cudaMemcpyHostToDevice));
    
    // Perform all-reduce
    NCCL_CHECK(ncclAllReduce(d_data, d_data, count, ncclFloat, ncclSum,
                            mpi_ctx.nccl_comm, mpi_ctx.streams[0]));
    CUDA_CHECK(cudaStreamSynchronize(mpi_ctx.streams[0]));
    
    CUDA_CHECK(cudaMemcpy(data, d_data, count * sizeof(float), cudaMemcpyDeviceToHost));
    cudaFree(d_data);
    
    return 0;
}
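
If the MPI library itself is CUDA-aware (for example, an OpenMPI build with CUDA support), device pointers can be handed to MPI directly and the host staging in gpu_allreduce becomes unnecessary. A hedged sketch of that variant; verify CUDA support in your MPI installation before relying on it:

// Sketch: all-reduce directly on a device buffer via CUDA-aware MPI
int gpu_allreduce_cuda_aware(float *d_data, size_t count) {
    // With CUDA-aware MPI the device pointer is passed straight through;
    // MPI_IN_PLACE avoids a separate receive buffer
    return MPI_Allreduce(MPI_IN_PLACE, d_data, (int)count, MPI_FLOAT,
                         MPI_SUM, MPI_COMM_WORLD);
}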

// Parallel reduction across all ranks
float parallel_sum_reduction(const float *data, size_t local_count) {
    float local_sum = 0.0f;
    
    // Local reduction
    #pragma omp parallel for reduction(+:local_sum)
    for (size_t i = 0; i < local_count; i++) {
        local_sum += data[i];
    }
    
    // Global reduction
    float global_sum;
    MPI_CHECK(MPI_Allreduce(&local_sum, &global_sum, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD));
    
    return global_sum;
}

// Performance benchmarking for MPI+GPU
void benchmark_mpi_gpu_performance(void) {
    if (mpi_ctx.rank == 0) {
        printf("\n=== MPI+GPU Performance Benchmark ===\n");
    }
    
    int sizes[] = {1024, 2048, 4096};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
    
    for (int i = 0; i < num_sizes; i++) {
        int size = sizes[i];
        
        // Allocate test matrices
        float *A = (float*)malloc(size * size * sizeof(float));
        float *B = (float*)malloc(size * size * sizeof(float));
        float *C = (float*)malloc(size * size * sizeof(float));
        
        // Initialize with random data
        for (int j = 0; j < size * size; j++) {
            A[j] = (float)rand() / RAND_MAX;
            B[j] = (float)rand() / RAND_MAX;
        }
        
        MPI_Barrier(MPI_COMM_WORLD);
        
        double start_time = MPI_Wtime();
        distributed_matrix_multiply(A, B, C, size, size, size);
        double end_time = MPI_Wtime();
        
        if (mpi_ctx.rank == 0) {
            double total_time = end_time - start_time;
            double total_gflops = (2.0 * size * size * size) / total_time / 1e9;
            
            printf("\nMatrix size: %dx%d\n", size, size);
            printf("Total time: %.3f seconds\n", total_time);
            printf("Aggregate performance: %.1f GFLOPS\n", total_gflops);
            printf("Performance per rank: %.1f GFLOPS\n", total_gflops / mpi_ctx.size);
        }
        
        free(A);
        free(B);
        free(C);
    }
}

// Cleanup MPI and GPU resources
void cleanup_mpi_gpu_framework(void) {
    // Cleanup NCCL
    if (mpi_ctx.nccl_initialized) {
        ncclCommDestroy(mpi_ctx.nccl_comm);
    }
    
    // Cleanup CUDA streams
    for (int i = 0; i < mpi_ctx.num_gpus; i++) {
        CUDA_CHECK(cudaSetDevice(mpi_ctx.gpu_ids[i]));
        CUDA_CHECK(cudaStreamDestroy(mpi_ctx.streams[i]));
    }
    
    free(mpi_ctx.gpu_ids);
    free(mpi_ctx.streams);
    
    // Finalize MPI
    MPI_Finalize();
    
    if (mpi_ctx.rank == 0) {
        printf("MPI+GPU framework cleanup completed\n");
    }
}

// Main function for testing
int main(int argc, char **argv) {
    // Initialize MPI and GPU framework
    if (init_mpi_gpu_framework(argc, argv) < 0) {
        return 1;
    }
    
    // Initialize NCCL for GPU communication
    if (init_nccl_communication() < 0) {
        return 1;
    }
    
    // Run performance benchmarks
    benchmark_mpi_gpu_performance();
    
    // Test all-reduce operation
    if (mpi_ctx.rank == 0) {
        printf("\n=== Testing GPU All-Reduce ===\n");
    }
    
    size_t test_size = 1000000;
    float *test_data = (float*)malloc(test_size * sizeof(float));
    
    // Initialize with rank-specific data
    for (size_t i = 0; i < test_size; i++) {
        test_data[i] = (float)mpi_ctx.rank;
    }
    
    double allreduce_start = MPI_Wtime();
    gpu_allreduce(test_data, test_size);
    double allreduce_end = MPI_Wtime();
    
    // Verify result (should be sum of all ranks)
    float expected = (mpi_ctx.size * (mpi_ctx.size - 1)) / 2.0f;
    bool correct = (fabs(test_data[0] - expected) < 1e-6);
    
    if (mpi_ctx.rank == 0) {
        printf("All-reduce time: %.3f ms\n", (allreduce_end - allreduce_start) * 1000);
        printf("Result: %s\n", correct ? "CORRECT" : "INCORRECT");
        printf("Bandwidth: %.1f GB/s\n", 
               (test_size * sizeof(float) * mpi_ctx.size) / 
               (allreduce_end - allreduce_start) / 1e9);
    }
    
    free(test_data);
    
    // Cleanup
    cleanup_mpi_gpu_framework();
    
    return 0;
}

Build and Testing Framework

#!/bin/bash
# hpc_gpu_build_framework.sh - Comprehensive HPC/GPU build and test framework

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
BUILD_DIR="$SCRIPT_DIR/build"
INSTALL_DIR="$SCRIPT_DIR/install"
TEST_DIR="$SCRIPT_DIR/tests"

echo "=== HPC/GPU Computing Build Framework ==="

# Setup environment
setup_environment() {
    echo "Setting up HPC/GPU computing environment..."
    
    mkdir -p "$BUILD_DIR"
    mkdir -p "$INSTALL_DIR"
    mkdir -p "$TEST_DIR"
    
    # Install CUDA development tools
    if ! command -v nvcc &> /dev/null; then
        echo "Installing CUDA development tools..."
        
        # Download and install CUDA toolkit
        cd /tmp
        wget https://developer.download.nvidia.com/compute/cuda/12.0.0/local_installers/cuda_12.0.0_525.60.13_linux.run
        sudo sh cuda_12.0.0_525.60.13_linux.run --silent --toolkit
        
        # Add to PATH for future shells, and export for the rest of this
        # script (sourcing ~/.bashrc has no effect in a non-interactive shell)
        echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
        echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> ~/.bashrc
        export PATH=/usr/local/cuda/bin:$PATH
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
    fi
    
    # Install OpenCL development headers
    if [ ! -f /usr/include/CL/cl.h ]; then
        echo "Installing OpenCL development headers..."
        sudo apt-get update
        sudo apt-get install -y opencl-headers ocl-icd-opencl-dev
    fi
    
    # Install MPI
    if ! command -v mpicc &> /dev/null; then
        echo "Installing OpenMPI..."
        sudo apt-get install -y openmpi-bin openmpi-common libopenmpi-dev
    fi
    
    # Install NCCL
    if [ ! -f /usr/include/nccl.h ]; then
        echo "Installing NCCL..."
        cd /tmp
        wget https://developer.download.nvidia.com/compute/redist/nccl/v2.15.5/nccl_2.15.5-1+cuda12.0_x86_64.txz
        tar -xf nccl_2.15.5-1+cuda12.0_x86_64.txz
        sudo cp -R nccl_2.15.5-1+cuda12.0_x86_64/* /usr/local/
    fi
    
    # Install cuDNN
    if [ ! -f /usr/local/cuda/include/cudnn.h ]; then
        echo "Installing cuDNN..."
        echo "Please download cuDNN from NVIDIA Developer website and install manually"
    fi
    
    echo "Environment setup completed"
}

# Build CUDA applications
build_cuda_applications() {
    echo "Building CUDA applications..."
    
    cd "$BUILD_DIR"
    
    # Copy source files
    cp "$SCRIPT_DIR"/*.c . 2>/dev/null || true
    cp "$SCRIPT_DIR"/*.cu . 2>/dev/null || true
    
    # Compile the CUDA framework to an object file as a syntax check; the
    # benchmark below #includes the source directly
    echo "Building CUDA framework..."
    nvcc -c cuda_framework.cu -Xcompiler -fopenmp
    
    # Build matrix multiplication benchmark
    cat > matrix_benchmark.cu << 'EOF'
#include "cuda_framework.c"

int main() {
    if (init_cuda_framework() < 0) {
        return 1;
    }
    
    int sizes[] = {512, 1024, 2048};
    int num_sizes = sizeof(sizes) / sizeof(sizes[0]);
    
    for (int i = 0; i < num_sizes; i++) {
        int size = sizes[i];
        printf("\n=== Matrix Size: %dx%d ===\n", size, size);
        
        // Allocate matrices
        float *A = (float*)malloc(size * size * sizeof(float));
        float *B = (float*)malloc(size * size * sizeof(float));
        float *C = (float*)malloc(size * size * sizeof(float));
        
        // Initialize with random data
        for (int j = 0; j < size * size; j++) {
            A[j] = (float)rand() / RAND_MAX;
            B[j] = (float)rand() / RAND_MAX;
        }
        
        // Multi-GPU test (distributes rows across all visible devices)
        printf("Multi-GPU test:\n");
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);
        
        // Perform matrix multiplication
        multi_gpu_matrix_multiply(A, B, C, size, size, size);
        
        clock_gettime(CLOCK_MONOTONIC, &end);
        
        double total_time = (end.tv_sec - start.tv_sec) + 
                           (end.tv_nsec - start.tv_nsec) / 1e9;
        double gflops = (2.0 * size * size * size) / (total_time * 1e9);
        
        printf("Time: %.3f seconds\n", total_time);
        printf("Performance: %.1f GFLOPS\n", gflops);
        
        free(A);
        free(B);
        free(C);
    }
    
    return 0;
}
EOF
    
    nvcc -o matrix_benchmark matrix_benchmark.cu \
        -lcublas -lcurand -lcufft -lnvml \
        -Xcompiler -fopenmp -lpthread
    
    echo "CUDA applications built successfully"
}

# Build OpenCL applications
build_opencl_applications() {
    echo "Building OpenCL applications..."
    
    cd "$BUILD_DIR"
    
    # Compile the OpenCL framework to an object file as a syntax check; the
    # raw-string kernel source requires a C++ compiler, and the benchmark
    # below #includes the source directly
    g++ -x c++ -c opencl_framework.c
    
    # Create OpenCL benchmark
    cat > opencl_benchmark.c << 'EOF'
#include "opencl_framework.c"

int main() {
    if (init_opencl_framework() < 0) {
        return 1;
    }
    
    // Test each available device
    for (int i = 0; i < ocl_fw.num_devices; i++) {
        if (create_opencl_context(i) < 0) {
            continue;
        }
        
        if (build_opencl_program(i, matrix_multiply_kernel) < 0) {
            continue;
        }
        
        benchmark_opencl_device(i);
    }
    
    return 0;
}
EOF
    
    g++ -x c++ -o opencl_benchmark opencl_benchmark.c \
        -lOpenCL -lpthread
    
    echo "OpenCL applications built successfully"
}

# Build MPI applications
build_mpi_applications() {
    echo "Building MPI applications..."
    
    cd "$BUILD_DIR"
    
    # Build MPI+GPU framework: the file contains CUDA kernel launches, so it
    # must go through nvcc; -ccbin mpicxx pulls in the MPI compile/link flags
    # (assumes an OpenMPI-style wrapper), and -rdc=true lets this unit call
    # the kernel defined in cuda_framework.cu
    nvcc -rdc=true -ccbin mpicxx -o mpi_gpu_framework \
        mpi_gpu_framework.cu cuda_framework.cu \
        -lcublas -lcurand -lcufft -lnvml -lnccl \
        -Xcompiler -fopenmp -lpthread
    
    echo "MPI applications built successfully"
}

# Run comprehensive tests
run_tests() {
    echo "Running HPC/GPU tests..."
    
    cd "$BUILD_DIR"
    
    # Test CUDA framework
    echo "=== Testing CUDA Framework ==="
    if command -v nvidia-smi &> /dev/null; then
        nvidia-smi
        ./matrix_benchmark
    else
        echo "NVIDIA GPU not available, skipping CUDA tests"
    fi
    
    # Test OpenCL framework
    echo -e "\n=== Testing OpenCL Framework ==="
    ./opencl_benchmark
    
    # Test MPI framework (single node)
    echo -e "\n=== Testing MPI+GPU Framework ==="
    if command -v mpirun &> /dev/null; then
        mpirun -np 2 ./mpi_gpu_framework
    else
        echo "MPI not available, skipping MPI tests"
    fi
}

# Performance benchmarking
run_benchmarks() {
    echo "Running performance benchmarks..."
    
    cd "$BUILD_DIR"
    
    # GPU Memory bandwidth test
    cat > memory_bandwidth_test.cu << 'EOF'
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>

int main() {
    int device_count;
    cudaGetDeviceCount(&device_count);
    
    for (int dev = 0; dev < device_count; dev++) {
        cudaSetDevice(dev);
        
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        
        printf("\n=== Device %d: %s ===\n", dev, prop.name);
        
        size_t size = 256 * 1024 * 1024; // 256 MB
        float *h_data = (float*)malloc(size);
        float *d_data;
        
        cudaMalloc(&d_data, size);
        
        // Initialize host data
        for (size_t i = 0; i < size/sizeof(float); i++) {
            h_data[i] = (float)i;
        }
        
        // Benchmark host to device transfer
        cudaEvent_t start, stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        
        cudaEventRecord(start);
        for (int i = 0; i < 10; i++) {
            cudaMemcpy(d_data, h_data, size, cudaMemcpyHostToDevice);
        }
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        
        float h2d_time;
        cudaEventElapsedTime(&h2d_time, start, stop);
        
        // Benchmark device to host transfer
        cudaEventRecord(start);
        for (int i = 0; i < 10; i++) {
            cudaMemcpy(h_data, d_data, size, cudaMemcpyDeviceToHost);
        }
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        
        float d2h_time;
        cudaEventElapsedTime(&d2h_time, start, stop);
        
        double h2d_bandwidth = (size * 10) / (h2d_time / 1000.0) / 1e9;
        double d2h_bandwidth = (size * 10) / (d2h_time / 1000.0) / 1e9;
        
        printf("Host to Device: %.1f GB/s\n", h2d_bandwidth);
        printf("Device to Host: %.1f GB/s\n", d2h_bandwidth);
        
        cudaFree(d_data);
        free(h_data);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
    
    return 0;
}
EOF
    
    if command -v nvcc &> /dev/null; then
        nvcc -o memory_bandwidth_test memory_bandwidth_test.cu
        ./memory_bandwidth_test
    else
        echo "nvcc not found, skipping GPU memory bandwidth tests"
    fi
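
    # The test above uses pageable host memory, so the driver stages each
    # copy through an internal pinned buffer and the numbers understate the
    # bus. As an illustration of the pinned-memory recommendation in the
    # report below, this sketch repeats the H2D measurement with page-locked
    # memory from cudaMallocHost, which typically lands much closer to the
    # PCIe peak. File name and sizes here are arbitrary choices.
    cat > pinned_bandwidth_test.cu << 'EOF'
#include <cuda_runtime.h>
#include <stdio.h>

int main() {
    size_t size = 256 * 1024 * 1024; // 256 MB, matching the pageable test
    float *h_pinned, *d_data;

    // Page-locked allocation: the DMA engine can read it directly
    cudaMallocHost(&h_pinned, size);
    cudaMalloc(&d_data, size);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (int i = 0; i < 10; i++) {
        cudaMemcpy(d_data, h_pinned, size, cudaMemcpyHostToDevice);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);

    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    printf("Pinned Host to Device: %.1f GB/s\n",
           (size * 10) / (ms / 1000.0) / 1e9);

    cudaFreeHost(h_pinned);
    cudaFree(d_data);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    return 0;
}
EOF

    if command -v nvcc &> /dev/null; then
        nvcc -o pinned_bandwidth_test pinned_bandwidth_test.cu
        ./pinned_bandwidth_test
    fi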
    
    # CPU baseline: compare these numbers against the GPU matrix_benchmark
    # results from the CUDA tests above
    echo -e "\n=== CPU vs GPU Performance Comparison ==="
    
    cat > cpu_gpu_comparison.c << 'EOF'
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>

void cpu_matrix_multiply(float *A, float *B, float *C, int size) {
    #pragma omp parallel for
    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            float sum = 0.0f;
            for (int k = 0; k < size; k++) {
                sum += A[i * size + k] * B[k * size + j];
            }
            C[i * size + j] = sum;
        }
    }
}

int main() {
    int size = 1024;
    
    float *A = malloc(size * size * sizeof(float));
    float *B = malloc(size * size * sizeof(float));
    float *C = malloc(size * size * sizeof(float));
    
    // Initialize matrices
    for (int i = 0; i < size * size; i++) {
        A[i] = (float)rand() / RAND_MAX;
        B[i] = (float)rand() / RAND_MAX;
    }
    
    // CPU benchmark
    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    
    cpu_matrix_multiply(A, B, C, size);
    
    clock_gettime(CLOCK_MONOTONIC, &end);
    
    double cpu_time = (end.tv_sec - start.tv_sec) + 
                     (end.tv_nsec - start.tv_nsec) / 1e9;
    double cpu_gflops = (2.0 * size * size * size) / (cpu_time * 1e9);
    
    printf("CPU Performance:\n");
    printf("  Time: %.3f seconds\n", cpu_time);
    printf("  GFLOPS: %.1f\n", cpu_gflops);
    printf("  Threads: %d\n", omp_get_max_threads());
    
    free(A);
    free(B);
    free(C);
    
    return 0;
}
EOF
    
    gcc -o cpu_gpu_comparison cpu_gpu_comparison.c -fopenmp -lm
    ./cpu_gpu_comparison
}

# Generate performance report
generate_report() {
    local report_file="$BUILD_DIR/performance_report.html"
    
    echo "Generating performance report..."
    
    cat > "$report_file" << 'EOF'
<!DOCTYPE html>
<html>
<head>
    <title>HPC/GPU Performance Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .section { margin: 20px 0; padding: 15px; border: 1px solid #ddd; }
        .metric { margin: 10px 0; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
        .good { color: green; }
        .warning { color: orange; }
        .poor { color: red; }
    </style>
</head>
<body>
    <h1>HPC/GPU Performance Analysis Report</h1>
    
    <div class="section">
        <h2>System Information</h2>
        <div class="metric">Generated: <script>document.write(new Date())</script></div>
        <div class="metric">Hostname: <span id="hostname">Loading...</span></div>
        <div class="metric">CUDA Version: <span id="cuda-version">Loading...</span></div>
        <div class="metric">OpenCL Platforms: <span id="opencl-platforms">Loading...</span></div>
    </div>
    
    <div class="section">
        <h2>GPU Performance Metrics</h2>
        <table>
            <tr>
                <th>Metric</th>
                <th>Value</th>
                <th>Status</th>
            </tr>
            <tr>
                <td>Matrix Multiplication (1024x1024)</td>
                <td id="matrix-perf">Loading...</td>
                <td id="matrix-status">Loading...</td>
            </tr>
            <tr>
                <td>Memory Bandwidth H2D</td>
                <td id="mem-h2d">Loading...</td>
                <td id="mem-h2d-status">Loading...</td>
            </tr>
            <tr>
                <td>Memory Bandwidth D2H</td>
                <td id="mem-d2h">Loading...</td>
                <td id="mem-d2h-status">Loading...</td>
            </tr>
        </table>
    </div>
    
    <div class="section">
        <h2>Optimization Recommendations</h2>
        <ul id="recommendations">
            <li>Enable GPU boost clocks for maximum performance</li>
            <li>Use pinned memory for faster CPU-GPU transfers</li>
            <li>Optimize kernel launch parameters using occupancy API</li>
            <li>Consider using multiple streams for overlapping computation and communication</li>
            <li>Implement memory pooling to reduce allocation overhead</li>
        </ul>
    </div>
    
    <div class="section">
        <h2>Test Results</h2>
        <div id="test-results">
            <p>Test results will be populated from benchmark outputs...</p>
        </div>
    </div>
</body>
</html>
EOF
    
    echo "Performance report generated: $report_file"
    echo "Open in browser: file://$report_file"
}
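
# The template above ships with static "Loading..." placeholders. This
# optional helper - a sketch, not called from main - shows one way to fill
# them in at generation time with sed; the span IDs must match those in
# the template, and nvcc is only queried if present.
populate_report() {
    local report_file="$BUILD_DIR/performance_report.html"

    # Replace the hostname placeholder with the actual machine name
    sed -i "s|<span id=\"hostname\">Loading...</span>|<span id=\"hostname\">$(hostname)</span>|" "$report_file"

    # Record the CUDA toolkit version, e.g. "12.3", if nvcc is available
    if command -v nvcc &> /dev/null; then
        local cuda_ver
        cuda_ver=$(nvcc --version | grep release | sed 's/.*release //;s/,.*//')
        sed -i "s|<span id=\"cuda-version\">Loading...</span>|<span id=\"cuda-version\">$cuda_ver</span>|" "$report_file"
    fi
}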

# Main execution
main() {
    case "${1:-help}" in
        setup)
            setup_environment
            ;;
        build-cuda)
            build_cuda_applications
            ;;
        build-opencl)
            build_opencl_applications
            ;;
        build-mpi)
            build_mpi_applications
            ;;
        build-all)
            setup_environment
            build_cuda_applications
            build_opencl_applications
            build_mpi_applications
            ;;
        test)
            run_tests
            ;;
        benchmark)
            run_benchmarks
            ;;
        report)
            generate_report
            ;;
        all)
            setup_environment
            build_cuda_applications
            build_opencl_applications
            build_mpi_applications
            run_tests
            run_benchmarks
            generate_report
            ;;
        *)
            echo "Usage: $0 {setup|build-cuda|build-opencl|build-mpi|build-all|test|benchmark|report|all}"
            echo ""
            echo "Commands:"
            echo "  setup        - Setup HPC/GPU development environment"
            echo "  build-cuda   - Build CUDA applications"
            echo "  build-opencl - Build OpenCL applications" 
            echo "  build-mpi    - Build MPI applications"
            echo "  build-all    - Build all applications"
            echo "  test         - Run functional tests"
            echo "  benchmark    - Run performance benchmarks"
            echo "  report       - Generate performance report"
            echo "  all          - Run complete workflow"
            ;;
    esac
}

main "$@"

This comprehensive HPC and GPU programming guide demonstrates:

  • Advanced CUDA programming with multi-GPU support and optimization techniques
  • Cross-platform OpenCL development for heterogeneous computing
  • MPI integration for distributed GPU cluster computing
  • NCCL for efficient GPU-to-GPU communication
  • Performance optimization and benchmarking frameworks
  • Production-ready build and test automation

The implementations showcase real-world HPC techniques used in scientific computing, machine learning, and high-performance applications.
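
As a closing illustration of the NCCL piece, here is a minimal single-process, multi-GPU all-reduce sketch. It uses ncclCommInitAll, which creates one communicator per local device inside a single process; a distributed setup like the MPI framework above would typically use ncclCommInitRank with an ncclUniqueId broadcast over MPI instead, so treat this as a simplified stand-in with error handling omitted. Build with: nvcc -o nccl_allreduce_sketch nccl_allreduce_sketch.cu -lnccl

// nccl_allreduce_sketch.cu - single-process multi-GPU all-reduce (sketch)
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <nccl.h>

int main() {
    int ndev = 0;
    cudaGetDeviceCount(&ndev);
    if (ndev < 1) { fprintf(stderr, "No CUDA devices\n"); return 1; }

    size_t count = 1 << 20; // 1M floats per device

    ncclComm_t *comms = (ncclComm_t *)malloc(ndev * sizeof(ncclComm_t));
    float **sendbuf = (float **)malloc(ndev * sizeof(float *));
    float **recvbuf = (float **)malloc(ndev * sizeof(float *));
    cudaStream_t *streams = (cudaStream_t *)malloc(ndev * sizeof(cudaStream_t));

    // NULL device list = use devices 0..ndev-1
    ncclCommInitAll(comms, ndev, NULL);

    for (int i = 0; i < ndev; i++) {
        cudaSetDevice(i);
        cudaMalloc(&sendbuf[i], count * sizeof(float));
        cudaMalloc(&recvbuf[i], count * sizeof(float));
        cudaMemset(sendbuf[i], 0, count * sizeof(float));
        cudaStreamCreate(&streams[i]);
    }

    // Group the per-GPU calls so NCCL launches them as one collective
    ncclGroupStart();
    for (int i = 0; i < ndev; i++) {
        ncclAllReduce(sendbuf[i], recvbuf[i], count, ncclFloat, ncclSum,
                      comms[i], streams[i]);
    }
    ncclGroupEnd();

    for (int i = 0; i < ndev; i++) {
        cudaSetDevice(i);
        cudaStreamSynchronize(streams[i]);
    }

    printf("All-reduce of %zu floats across %d GPU(s) complete\n", count, ndev);

    for (int i = 0; i < ndev; i++) {
        cudaSetDevice(i);
        cudaFree(sendbuf[i]);
        cudaFree(recvbuf[i]);
        cudaStreamDestroy(streams[i]);
        ncclCommDestroy(comms[i]);
    }
    return 0;
}

The group-start/group-end bracketing matters: it lets NCCL treat the per-communicator calls as one collective, avoiding the deadlock that would occur if each blocking call were issued serially from a single thread.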
