Advanced Linux graphics and GPU programming requires deep understanding of modern graphics APIs, parallel computing architectures, and optimization techniques. This comprehensive guide explores building high-performance applications using Vulkan, CUDA, OpenCL, and advanced rendering techniques for both graphics and general-purpose GPU computing.

Advanced Linux Graphics and GPU Programming

Vulkan High-Performance Rendering Engine

Advanced Vulkan Graphics Framework

// vulkan_engine.c - Advanced Vulkan graphics engine implementation
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <math.h>

#include <vulkan/vulkan.h>
#include <GLFW/glfw3.h>
#include <cglm/cglm.h>

#define MAX_FRAMES_IN_FLIGHT 2
#define MAX_DESCRIPTOR_SETS 1000
#define MAX_BUFFERS 1000
#define MAX_TEXTURES 1000
#define MAX_PIPELINES 100
#define MAX_RENDER_PASSES 10

// Vulkan engine structures
typedef struct {
    VkInstance instance;
    VkPhysicalDevice physical_device;
    VkDevice device;
    VkQueue graphics_queue;
    VkQueue present_queue;
    VkQueue compute_queue;
    VkCommandPool command_pool;
    VkCommandPool compute_command_pool;
    
    // Surface and swapchain
    VkSurfaceKHR surface;
    VkSwapchainKHR swapchain;
    VkFormat swapchain_image_format;
    VkExtent2D swapchain_extent;
    VkImage *swapchain_images;
    VkImageView *swapchain_image_views;
    uint32_t swapchain_image_count;
    
    // Synchronization
    VkSemaphore image_available_semaphores[MAX_FRAMES_IN_FLIGHT];
    VkSemaphore render_finished_semaphores[MAX_FRAMES_IN_FLIGHT];
    VkFence in_flight_fences[MAX_FRAMES_IN_FLIGHT];
    
    // Memory management
    VkDeviceMemory device_memory_blocks[MAX_BUFFERS];
    uint32_t memory_block_count;
    
    // Descriptor management
    VkDescriptorPool descriptor_pool;
    VkDescriptorSet descriptor_sets[MAX_DESCRIPTOR_SETS];
    uint32_t descriptor_set_count;
    
    // Pipeline management
    VkRenderPass render_passes[MAX_RENDER_PASSES];
    VkPipeline pipelines[MAX_PIPELINES];
    VkPipelineLayout pipeline_layouts[MAX_PIPELINES];
    uint32_t pipeline_count;
    
    // Command buffers
    VkCommandBuffer command_buffers[MAX_FRAMES_IN_FLIGHT];
    VkCommandBuffer compute_command_buffers[MAX_FRAMES_IN_FLIGHT];
    
    // Debug
    VkDebugUtilsMessengerEXT debug_messenger;
    
    // Frame data
    uint32_t current_frame;
    bool framebuffer_resized;
    
} vulkan_engine_t;

// Buffer management
typedef struct {
    VkBuffer buffer;
    VkDeviceMemory memory;
    VkDeviceSize size;
    void *mapped_data;
    VkBufferUsageFlags usage;
    VkMemoryPropertyFlags properties;
} vulkan_buffer_t;

// Texture management
typedef struct {
    VkImage image;
    VkDeviceMemory memory;
    VkImageView view;
    VkSampler sampler;
    VkFormat format;
    uint32_t width;
    uint32_t height;
    uint32_t mip_levels;
} vulkan_texture_t;

// Shader management
typedef struct {
    VkShaderModule vertex_shader;
    VkShaderModule fragment_shader;
    VkShaderModule geometry_shader;
    VkShaderModule compute_shader;
    
    VkPipelineShaderStageCreateInfo *stages;
    uint32_t stage_count;
} vulkan_shader_t;

// Vertex data structures
typedef struct {
    vec3 position;
    vec3 normal;
    vec2 tex_coord;
    vec4 color;
} vertex_t;

// Uniform buffer objects
typedef struct {
    mat4 model;
    mat4 view;
    mat4 projection;
    vec4 camera_position;
    vec4 light_position;
    vec4 light_color;
} uniform_buffer_object_t;

// Compute shader data
typedef struct {
    vec4 position;
    vec4 velocity;
    vec4 color;
    float life;
} particle_t;

// Function prototypes
int vulkan_engine_init(vulkan_engine_t *engine, GLFWwindow *window);
int vulkan_engine_cleanup(vulkan_engine_t *engine);
int vulkan_engine_draw_frame(vulkan_engine_t *engine);
int vulkan_engine_wait_idle(vulkan_engine_t *engine);

// Vulkan setup functions
int create_instance(vulkan_engine_t *engine);
int setup_debug_messenger(vulkan_engine_t *engine);
int create_surface(vulkan_engine_t *engine, GLFWwindow *window);
int pick_physical_device(vulkan_engine_t *engine);
int create_logical_device(vulkan_engine_t *engine);
int create_swapchain(vulkan_engine_t *engine);
int create_image_views(vulkan_engine_t *engine);
int create_render_pass(vulkan_engine_t *engine);
int create_descriptor_set_layout(vulkan_engine_t *engine);
int create_graphics_pipeline(vulkan_engine_t *engine);
int create_framebuffers(vulkan_engine_t *engine);
int create_command_pool(vulkan_engine_t *engine);
int create_vertex_buffer(vulkan_engine_t *engine);
int create_index_buffer(vulkan_engine_t *engine);
int create_uniform_buffers(vulkan_engine_t *engine);
int create_descriptor_pool(vulkan_engine_t *engine);
int create_descriptor_sets(vulkan_engine_t *engine);
int create_command_buffers(vulkan_engine_t *engine);
int create_sync_objects(vulkan_engine_t *engine);

// Buffer management functions
int create_buffer(vulkan_engine_t *engine, VkDeviceSize size, VkBufferUsageFlags usage, 
                 VkMemoryPropertyFlags properties, vulkan_buffer_t *buffer);
int copy_buffer(vulkan_engine_t *engine, vulkan_buffer_t *src, vulkan_buffer_t *dst, VkDeviceSize size);
void destroy_buffer(vulkan_engine_t *engine, vulkan_buffer_t *buffer);

// Texture management functions
int create_texture_image(vulkan_engine_t *engine, const char *filename, vulkan_texture_t *texture);
int create_texture_sampler(vulkan_engine_t *engine, vulkan_texture_t *texture);
void destroy_texture(vulkan_engine_t *engine, vulkan_texture_t *texture);

// Shader management functions
int load_shader_module(vulkan_engine_t *engine, const char *filename, VkShaderModule *shader_module);
int create_shader_stages(vulkan_engine_t *engine, vulkan_shader_t *shader);
void destroy_shader(vulkan_engine_t *engine, vulkan_shader_t *shader);

// Utility functions
uint32_t find_memory_type(vulkan_engine_t *engine, uint32_t type_filter, VkMemoryPropertyFlags properties);
VkFormat find_supported_format(vulkan_engine_t *engine, VkFormat *candidates, uint32_t candidate_count,
                               VkImageTiling tiling, VkFormatFeatureFlags features);
int has_stencil_component(VkFormat format);

// Debug callback
static VKAPI_ATTR VkBool32 VKAPI_CALL debug_callback(
    VkDebugUtilsMessageSeverityFlagBitsEXT message_severity,
    VkDebugUtilsMessageTypeFlagsEXT message_type,
    const VkDebugUtilsMessengerCallbackDataEXT *callback_data,
    void *user_data) {
    
    if (message_severity >= VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) {
        fprintf(stderr, "Validation layer: %s\n", callback_data->pMessage);
    }
    
    return VK_FALSE;
}

// Global engine instance
static vulkan_engine_t g_engine;

int main(int argc, char *argv[]) {
    // Initialize GLFW
    if (!glfwInit()) {
        fprintf(stderr, "Failed to initialize GLFW\n");
        return -1;
    }
    
    // Create window
    glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
    glfwWindowHint(GLFW_RESIZABLE, GLFW_TRUE);
    
    GLFWwindow *window = glfwCreateWindow(1920, 1080, "Vulkan Engine", NULL, NULL);
    if (!window) {
        fprintf(stderr, "Failed to create GLFW window\n");
        glfwTerminate();
        return -1;
    }
    
    // Initialize Vulkan engine
    if (vulkan_engine_init(&g_engine, window) != 0) {
        fprintf(stderr, "Failed to initialize Vulkan engine\n");
        glfwDestroyWindow(window);
        glfwTerminate();
        return -1;
    }
    
    // Main render loop
    while (!glfwWindowShouldClose(window)) {
        glfwPollEvents();
        
        if (vulkan_engine_draw_frame(&g_engine) != 0) {
            fprintf(stderr, "Failed to draw frame\n");
            break;
        }
    }
    
    // Wait for device to be idle before cleanup
    vulkan_engine_wait_idle(&g_engine);
    
    // Cleanup
    vulkan_engine_cleanup(&g_engine);
    glfwDestroyWindow(window);
    glfwTerminate();
    
    return 0;
}

int vulkan_engine_init(vulkan_engine_t *engine, GLFWwindow *window) {
    if (!engine || !window) return -1;
    
    memset(engine, 0, sizeof(vulkan_engine_t));
    
    // Create Vulkan instance
    if (create_instance(engine) != 0) {
        fprintf(stderr, "Failed to create Vulkan instance\n");
        return -1;
    }
    
    // Setup debug messenger
    if (setup_debug_messenger(engine) != 0) {
        fprintf(stderr, "Failed to setup debug messenger\n");
        return -1;
    }
    
    // Create surface
    if (create_surface(engine, window) != 0) {
        fprintf(stderr, "Failed to create surface\n");
        return -1;
    }
    
    // Pick physical device
    if (pick_physical_device(engine) != 0) {
        fprintf(stderr, "Failed to pick physical device\n");
        return -1;
    }
    
    // Create logical device
    if (create_logical_device(engine) != 0) {
        fprintf(stderr, "Failed to create logical device\n");
        return -1;
    }
    
    // Create swapchain
    if (create_swapchain(engine) != 0) {
        fprintf(stderr, "Failed to create swapchain\n");
        return -1;
    }
    
    // Create image views
    if (create_image_views(engine) != 0) {
        fprintf(stderr, "Failed to create image views\n");
        return -1;
    }
    
    // Create render pass
    if (create_render_pass(engine) != 0) {
        fprintf(stderr, "Failed to create render pass\n");
        return -1;
    }
    
    // Create descriptor set layout
    if (create_descriptor_set_layout(engine) != 0) {
        fprintf(stderr, "Failed to create descriptor set layout\n");
        return -1;
    }
    
    // Create graphics pipeline
    if (create_graphics_pipeline(engine) != 0) {
        fprintf(stderr, "Failed to create graphics pipeline\n");
        return -1;
    }
    
    // Create command pool
    if (create_command_pool(engine) != 0) {
        fprintf(stderr, "Failed to create command pool\n");
        return -1;
    }
    
    // Create vertex buffer
    if (create_vertex_buffer(engine) != 0) {
        fprintf(stderr, "Failed to create vertex buffer\n");
        return -1;
    }
    
    // Create uniform buffers
    if (create_uniform_buffers(engine) != 0) {
        fprintf(stderr, "Failed to create uniform buffers\n");
        return -1;
    }
    
    // Create descriptor pool
    if (create_descriptor_pool(engine) != 0) {
        fprintf(stderr, "Failed to create descriptor pool\n");
        return -1;
    }
    
    // Create descriptor sets
    if (create_descriptor_sets(engine) != 0) {
        fprintf(stderr, "Failed to create descriptor sets\n");
        return -1;
    }
    
    // Create command buffers
    if (create_command_buffers(engine) != 0) {
        fprintf(stderr, "Failed to create command buffers\n");
        return -1;
    }
    
    // Create synchronization objects
    if (create_sync_objects(engine) != 0) {
        fprintf(stderr, "Failed to create sync objects\n");
        return -1;
    }
    
    printf("Vulkan engine initialized successfully\n");
    return 0;
}

int vulkan_engine_draw_frame(vulkan_engine_t *engine) {
    if (!engine) return -1;
    
    // Wait for previous frame to finish
    vkWaitForFences(engine->device, 1, &engine->in_flight_fences[engine->current_frame], VK_TRUE, UINT64_MAX);
    
    // Acquire next image from swapchain
    uint32_t image_index;
    VkResult result = vkAcquireNextImageKHR(engine->device, engine->swapchain, UINT64_MAX,
                                           engine->image_available_semaphores[engine->current_frame],
                                           VK_NULL_HANDLE, &image_index);
    
    if (result == VK_ERROR_OUT_OF_DATE_KHR) {
        // Recreate swapchain
        return 0;
    } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
        fprintf(stderr, "Failed to acquire swapchain image\n");
        return -1;
    }
    
    // Reset fence
    vkResetFences(engine->device, 1, &engine->in_flight_fences[engine->current_frame]);
    
    // Reset command buffer
    vkResetCommandBuffer(engine->command_buffers[engine->current_frame], 0);
    
    // Record command buffer
    VkCommandBufferBeginInfo begin_info = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        .flags = 0,
        .pInheritanceInfo = NULL
    };
    
    if (vkBeginCommandBuffer(engine->command_buffers[engine->current_frame], &begin_info) != VK_SUCCESS) {
        fprintf(stderr, "Failed to begin recording command buffer\n");
        return -1;
    }
    
    // Begin render pass
    VkRenderPassBeginInfo render_pass_info = {
        .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
        .renderPass = engine->render_passes[0],
        .framebuffer = engine->swapchain_framebuffers[image_index],
        .renderArea.offset = {0, 0},
        .renderArea.extent = engine->swapchain_extent
    };
    
    VkClearValue clear_values[2] = {
        {.color = {{0.0f, 0.0f, 0.0f, 1.0f}}},
        {.depthStencil = {1.0f, 0}}
    };
    
    render_pass_info.clearValueCount = 2;
    render_pass_info.pClearValues = clear_values;
    
    vkCmdBeginRenderPass(engine->command_buffers[engine->current_frame], &render_pass_info, VK_SUBPASS_CONTENTS_INLINE);
    
    // Bind graphics pipeline
    vkCmdBindPipeline(engine->command_buffers[engine->current_frame], VK_PIPELINE_BIND_POINT_GRAPHICS, engine->pipelines[0]);
    
    // Set viewport
    VkViewport viewport = {
        .x = 0.0f,
        .y = 0.0f,
        .width = (float)engine->swapchain_extent.width,
        .height = (float)engine->swapchain_extent.height,
        .minDepth = 0.0f,
        .maxDepth = 1.0f
    };
    vkCmdSetViewport(engine->command_buffers[engine->current_frame], 0, 1, &viewport);
    
    // Set scissor
    VkRect2D scissor = {
        .offset = {0, 0},
        .extent = engine->swapchain_extent
    };
    vkCmdSetScissor(engine->command_buffers[engine->current_frame], 0, 1, &scissor);
    
    // Bind vertex buffer
    VkBuffer vertex_buffers[] = {engine->vertex_buffer.buffer};
    VkDeviceSize offsets[] = {0};
    vkCmdBindVertexBuffers(engine->command_buffers[engine->current_frame], 0, 1, vertex_buffers, offsets);
    
    // Bind index buffer
    vkCmdBindIndexBuffer(engine->command_buffers[engine->current_frame], engine->index_buffer.buffer, 0, VK_INDEX_TYPE_UINT16);
    
    // Bind descriptor sets
    vkCmdBindDescriptorSets(engine->command_buffers[engine->current_frame], VK_PIPELINE_BIND_POINT_GRAPHICS,
                           engine->pipeline_layouts[0], 0, 1, &engine->descriptor_sets[engine->current_frame], 0, NULL);
    
    // Draw indexed
    vkCmdDrawIndexed(engine->command_buffers[engine->current_frame], engine->index_count, 1, 0, 0, 0);
    
    // End render pass
    vkCmdEndRenderPass(engine->command_buffers[engine->current_frame]);
    
    // End command buffer
    if (vkEndCommandBuffer(engine->command_buffers[engine->current_frame]) != VK_SUCCESS) {
        fprintf(stderr, "Failed to record command buffer\n");
        return -1;
    }
    
    // Submit command buffer
    VkSubmitInfo submit_info = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO
    };
    
    VkSemaphore wait_semaphores[] = {engine->image_available_semaphores[engine->current_frame]};
    VkPipelineStageFlags wait_stages[] = {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT};
    submit_info.waitSemaphoreCount = 1;
    submit_info.pWaitSemaphores = wait_semaphores;
    submit_info.pWaitDstStageMask = wait_stages;
    submit_info.commandBufferCount = 1;
    submit_info.pCommandBuffers = &engine->command_buffers[engine->current_frame];
    
    VkSemaphore signal_semaphores[] = {engine->render_finished_semaphores[engine->current_frame]};
    submit_info.signalSemaphoreCount = 1;
    submit_info.pSignalSemaphores = signal_semaphores;
    
    if (vkQueueSubmit(engine->graphics_queue, 1, &submit_info, engine->in_flight_fences[engine->current_frame]) != VK_SUCCESS) {
        fprintf(stderr, "Failed to submit draw command buffer\n");
        return -1;
    }
    
    // Present result
    VkPresentInfoKHR present_info = {
        .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
        .waitSemaphoreCount = 1,
        .pWaitSemaphores = signal_semaphores,
        .swapchainCount = 1,
        .pSwapchains = &engine->swapchain,
        .pImageIndices = &image_index,
        .pResults = NULL
    };
    
    result = vkQueuePresentKHR(engine->present_queue, &present_info);
    
    if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || engine->framebuffer_resized) {
        engine->framebuffer_resized = false;
        // Recreate swapchain
        return 0;
    } else if (result != VK_SUCCESS) {
        fprintf(stderr, "Failed to present swap chain image\n");
        return -1;
    }
    
    // Advance to next frame
    engine->current_frame = (engine->current_frame + 1) % MAX_FRAMES_IN_FLIGHT;
    
    return 0;
}

int create_instance(vulkan_engine_t *engine) {
    // Application info
    VkApplicationInfo app_info = {
        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
        .pApplicationName = "Vulkan Engine",
        .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
        .pEngineName = "Custom Engine",
        .engineVersion = VK_MAKE_VERSION(1, 0, 0),
        .apiVersion = VK_API_VERSION_1_2
    };
    
    // Get required extensions
    uint32_t glfw_extension_count = 0;
    const char **glfw_extensions = glfwGetRequiredInstanceExtensions(&glfw_extension_count);
    
    // Add debug extension
    const char *extensions[glfw_extension_count + 1];
    for (uint32_t i = 0; i < glfw_extension_count; i++) {
        extensions[i] = glfw_extensions[i];
    }
    extensions[glfw_extension_count] = VK_EXT_DEBUG_UTILS_EXTENSION_NAME;
    
    // Validation layers
    const char *validation_layers[] = {
        "VK_LAYER_KHRONOS_validation"
    };
    
    // Instance create info
    VkInstanceCreateInfo create_info = {
        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
        .pApplicationInfo = &app_info,
        .enabledExtensionCount = glfw_extension_count + 1,
        .ppEnabledExtensionNames = extensions,
        .enabledLayerCount = 1,
        .ppEnabledLayerNames = validation_layers
    };
    
    // Create instance
    if (vkCreateInstance(&create_info, NULL, &engine->instance) != VK_SUCCESS) {
        fprintf(stderr, "Failed to create Vulkan instance\n");
        return -1;
    }
    
    return 0;
}

int create_buffer(vulkan_engine_t *engine, VkDeviceSize size, VkBufferUsageFlags usage,
                 VkMemoryPropertyFlags properties, vulkan_buffer_t *buffer) {
    if (!engine || !buffer) return -1;
    
    // Create buffer
    VkBufferCreateInfo buffer_info = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .size = size,
        .usage = usage,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE
    };
    
    if (vkCreateBuffer(engine->device, &buffer_info, NULL, &buffer->buffer) != VK_SUCCESS) {
        fprintf(stderr, "Failed to create buffer\n");
        return -1;
    }
    
    // Get memory requirements
    VkMemoryRequirements mem_requirements;
    vkGetBufferMemoryRequirements(engine->device, buffer->buffer, &mem_requirements);
    
    // Allocate memory
    VkMemoryAllocateInfo alloc_info = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = mem_requirements.size,
        .memoryTypeIndex = find_memory_type(engine, mem_requirements.memoryTypeBits, properties)
    };
    
    if (vkAllocateMemory(engine->device, &alloc_info, NULL, &buffer->memory) != VK_SUCCESS) {
        fprintf(stderr, "Failed to allocate buffer memory\n");
        vkDestroyBuffer(engine->device, buffer->buffer, NULL);
        return -1;
    }
    
    // Bind buffer memory
    vkBindBufferMemory(engine->device, buffer->buffer, buffer->memory, 0);
    
    // Store buffer info
    buffer->size = size;
    buffer->usage = usage;
    buffer->properties = properties;
    
    return 0;
}

uint32_t find_memory_type(vulkan_engine_t *engine, uint32_t type_filter, VkMemoryPropertyFlags properties) {
    VkPhysicalDeviceMemoryProperties mem_properties;
    vkGetPhysicalDeviceMemoryProperties(engine->physical_device, &mem_properties);
    
    for (uint32_t i = 0; i < mem_properties.memoryTypeCount; i++) {
        if ((type_filter & (1 << i)) && (mem_properties.memoryTypes[i].propertyFlags & properties) == properties) {
            return i;
        }
    }
    
    fprintf(stderr, "Failed to find suitable memory type\n");
    return 0;
}

int vulkan_engine_cleanup(vulkan_engine_t *engine) {
    if (!engine) return -1;
    
    // Cleanup synchronization objects
    for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
        vkDestroySemaphore(engine->device, engine->render_finished_semaphores[i], NULL);
        vkDestroySemaphore(engine->device, engine->image_available_semaphores[i], NULL);
        vkDestroyFence(engine->device, engine->in_flight_fences[i], NULL);
    }
    
    // Cleanup command pool
    vkDestroyCommandPool(engine->device, engine->command_pool, NULL);
    
    // Cleanup buffers
    destroy_buffer(engine, &engine->vertex_buffer);
    destroy_buffer(engine, &engine->index_buffer);
    
    // Cleanup descriptor pool
    vkDestroyDescriptorPool(engine->device, engine->descriptor_pool, NULL);
    
    // Cleanup pipelines
    for (uint32_t i = 0; i < engine->pipeline_count; i++) {
        vkDestroyPipeline(engine->device, engine->pipelines[i], NULL);
        vkDestroyPipelineLayout(engine->device, engine->pipeline_layouts[i], NULL);
    }
    
    // Cleanup render passes
    for (uint32_t i = 0; i < MAX_RENDER_PASSES; i++) {
        if (engine->render_passes[i] != VK_NULL_HANDLE) {
            vkDestroyRenderPass(engine->device, engine->render_passes[i], NULL);
        }
    }
    
    // Cleanup swapchain
    for (uint32_t i = 0; i < engine->swapchain_image_count; i++) {
        vkDestroyImageView(engine->device, engine->swapchain_image_views[i], NULL);
    }
    free(engine->swapchain_images);
    free(engine->swapchain_image_views);
    vkDestroySwapchainKHR(engine->device, engine->swapchain, NULL);
    
    // Cleanup device
    vkDestroyDevice(engine->device, NULL);
    
    // Cleanup debug messenger
    if (engine->debug_messenger != VK_NULL_HANDLE) {
        PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)
            vkGetInstanceProcAddr(engine->instance, "vkDestroyDebugUtilsMessengerEXT");
        if (func != NULL) {
            func(engine->instance, engine->debug_messenger, NULL);
        }
    }
    
    // Cleanup surface
    vkDestroySurfaceKHR(engine->instance, engine->surface, NULL);
    
    // Cleanup instance
    vkDestroyInstance(engine->instance, NULL);
    
    printf("Vulkan engine cleanup completed\n");
    return 0;
}

int vulkan_engine_wait_idle(vulkan_engine_t *engine) {
    if (!engine) return -1;
    return vkDeviceWaitIdle(engine->device);
}

CUDA Parallel Computing Framework

// cuda_compute.c - Advanced CUDA parallel computing framework
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <cufft.h>

#define CUDA_CHECK(call) \
    do { \
        cudaError_t err = call; \
        if (err != cudaSuccess) { \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
            exit(1); \
        } \
    } while(0)

#define BLOCK_SIZE 256
#define GRID_SIZE 128
#define MAX_DEVICES 8

// CUDA compute context
typedef struct {
    int device_count;
    int current_device;
    cudaDeviceProp device_props[MAX_DEVICES];
    cudaStream_t streams[MAX_DEVICES];
    cublasHandle_t cublas_handles[MAX_DEVICES];
    curandGenerator_t curand_generators[MAX_DEVICES];
    
    // Memory pools
    void *device_memory_pool;
    size_t pool_size;
    size_t pool_used;
    
    // Performance metrics
    cudaEvent_t start_event;
    cudaEvent_t stop_event;
    float last_kernel_time;
    
} cuda_context_t;

// Matrix operations
typedef struct {
    float *data;
    int rows;
    int cols;
    size_t pitch;
    bool on_device;
} matrix_t;

// Vector operations
typedef struct {
    float *data;
    int size;
    bool on_device;
} vector_t;

// Function prototypes
int cuda_init(cuda_context_t *ctx);
int cuda_cleanup(cuda_context_t *ctx);
int cuda_select_device(cuda_context_t *ctx, int device_id);

// Memory management
int cuda_allocate_matrix(cuda_context_t *ctx, matrix_t *matrix, int rows, int cols);
int cuda_allocate_vector(cuda_context_t *ctx, vector_t *vector, int size);
int cuda_copy_matrix_to_device(cuda_context_t *ctx, matrix_t *dst, matrix_t *src);
int cuda_copy_matrix_to_host(cuda_context_t *ctx, matrix_t *dst, matrix_t *src);
int cuda_copy_vector_to_device(cuda_context_t *ctx, vector_t *dst, vector_t *src);
int cuda_copy_vector_to_host(cuda_context_t *ctx, vector_t *dst, vector_t *src);
void cuda_free_matrix(cuda_context_t *ctx, matrix_t *matrix);
void cuda_free_vector(cuda_context_t *ctx, vector_t *vector);

// CUDA kernels
__global__ void vector_add_kernel(float *a, float *b, float *c, int n);
__global__ void vector_scale_kernel(float *a, float scale, int n);
__global__ void matrix_multiply_kernel(float *a, float *b, float *c, int m, int n, int k);
__global__ void matrix_transpose_kernel(float *input, float *output, int rows, int cols);
__global__ void convolution_2d_kernel(float *input, float *kernel, float *output, 
                                     int input_width, int input_height, int kernel_size);
__global__ void fft_preprocess_kernel(float *input, float *output, int size);
__global__ void reduce_sum_kernel(float *input, float *output, int n);

// High-level operations
int cuda_vector_add(cuda_context_t *ctx, vector_t *a, vector_t *b, vector_t *result);
int cuda_vector_scale(cuda_context_t *ctx, vector_t *vector, float scale);
int cuda_matrix_multiply(cuda_context_t *ctx, matrix_t *a, matrix_t *b, matrix_t *result);
int cuda_matrix_transpose(cuda_context_t *ctx, matrix_t *input, matrix_t *output);
int cuda_convolution_2d(cuda_context_t *ctx, matrix_t *input, matrix_t *kernel, matrix_t *output);
int cuda_fft_1d(cuda_context_t *ctx, vector_t *input, vector_t *output);
int cuda_reduce_sum(cuda_context_t *ctx, vector_t *input, float *result);

// Performance utilities
float cuda_benchmark_kernel(cuda_context_t *ctx, void (*kernel_func)(void), int iterations);
int cuda_profile_memory_bandwidth(cuda_context_t *ctx);
int cuda_profile_compute_throughput(cuda_context_t *ctx);

// Global context
static cuda_context_t g_cuda_ctx;

int main(int argc, char *argv[]) {
    // Initialize CUDA
    if (cuda_init(&g_cuda_ctx) != 0) {
        fprintf(stderr, "Failed to initialize CUDA\n");
        return -1;
    }
    
    printf("CUDA initialized with %d devices\n", g_cuda_ctx.device_count);
    
    // Example: Large matrix multiplication
    matrix_t a, b, c;
    int matrix_size = 2048;
    
    // Allocate matrices
    cuda_allocate_matrix(&g_cuda_ctx, &a, matrix_size, matrix_size);
    cuda_allocate_matrix(&g_cuda_ctx, &b, matrix_size, matrix_size);
    cuda_allocate_matrix(&g_cuda_ctx, &c, matrix_size, matrix_size);
    
    // Initialize matrices with random data
    for (int i = 0; i < matrix_size * matrix_size; i++) {
        a.data[i] = (float)rand() / RAND_MAX;
        b.data[i] = (float)rand() / RAND_MAX;
    }
    
    // Copy to device
    cuda_copy_matrix_to_device(&g_cuda_ctx, &a, &a);
    cuda_copy_matrix_to_device(&g_cuda_ctx, &b, &b);
    
    // Perform matrix multiplication
    clock_t start = clock();
    cuda_matrix_multiply(&g_cuda_ctx, &a, &b, &c);
    CUDA_CHECK(cudaDeviceSynchronize());
    clock_t end = clock();
    
    double elapsed_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    double gflops = (2.0 * matrix_size * matrix_size * matrix_size) / (elapsed_time * 1e9);
    
    printf("Matrix multiplication (%dx%d): %.3f seconds, %.2f GFLOPS\n", 
           matrix_size, matrix_size, elapsed_time, gflops);
    
    // Example: Vector operations
    vector_t vec_a, vec_b, vec_result;
    int vector_size = 1000000;
    
    cuda_allocate_vector(&g_cuda_ctx, &vec_a, vector_size);
    cuda_allocate_vector(&g_cuda_ctx, &vec_b, vector_size);
    cuda_allocate_vector(&g_cuda_ctx, &vec_result, vector_size);
    
    // Initialize vectors
    for (int i = 0; i < vector_size; i++) {
        vec_a.data[i] = (float)i;
        vec_b.data[i] = (float)(i * 2);
    }
    
    // Copy to device and perform operations
    cuda_copy_vector_to_device(&g_cuda_ctx, &vec_a, &vec_a);
    cuda_copy_vector_to_device(&g_cuda_ctx, &vec_b, &vec_b);
    
    cuda_vector_add(&g_cuda_ctx, &vec_a, &vec_b, &vec_result);
    cuda_vector_scale(&g_cuda_ctx, &vec_result, 0.5f);
    
    // Reduce sum
    float sum_result;
    cuda_reduce_sum(&g_cuda_ctx, &vec_result, &sum_result);
    
    printf("Vector sum result: %.2f\n", sum_result);
    
    // Example: 2D convolution
    matrix_t image, filter, convolved;
    int image_size = 1024;
    int filter_size = 5;
    
    cuda_allocate_matrix(&g_cuda_ctx, &image, image_size, image_size);
    cuda_allocate_matrix(&g_cuda_ctx, &filter, filter_size, filter_size);
    cuda_allocate_matrix(&g_cuda_ctx, &convolved, image_size, image_size);
    
    // Initialize image and filter
    for (int i = 0; i < image_size * image_size; i++) {
        image.data[i] = (float)rand() / RAND_MAX;
    }
    
    // Gaussian filter
    float sigma = 1.0f;
    for (int i = 0; i < filter_size; i++) {
        for (int j = 0; j < filter_size; j++) {
            int x = i - filter_size / 2;
            int y = j - filter_size / 2;
            filter.data[i * filter_size + j] = expf(-(x*x + y*y) / (2 * sigma * sigma));
        }
    }
    
    // Copy to device and convolve
    cuda_copy_matrix_to_device(&g_cuda_ctx, &image, &image);
    cuda_copy_matrix_to_device(&g_cuda_ctx, &filter, &filter);
    
    start = clock();
    cuda_convolution_2d(&g_cuda_ctx, &image, &filter, &convolved);
    CUDA_CHECK(cudaDeviceSynchronize());
    end = clock();
    
    elapsed_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("2D convolution (%dx%d): %.3f seconds\n", image_size, image_size, elapsed_time);
    
    // Cleanup
    cuda_free_matrix(&g_cuda_ctx, &a);
    cuda_free_matrix(&g_cuda_ctx, &b);
    cuda_free_matrix(&g_cuda_ctx, &c);
    cuda_free_vector(&g_cuda_ctx, &vec_a);
    cuda_free_vector(&g_cuda_ctx, &vec_b);
    cuda_free_vector(&g_cuda_ctx, &vec_result);
    cuda_free_matrix(&g_cuda_ctx, &image);
    cuda_free_matrix(&g_cuda_ctx, &filter);
    cuda_free_matrix(&g_cuda_ctx, &convolved);
    
    cuda_cleanup(&g_cuda_ctx);
    
    return 0;
}

int cuda_init(cuda_context_t *ctx) {
    if (!ctx) return -1;
    
    memset(ctx, 0, sizeof(cuda_context_t));
    
    // Get device count
    CUDA_CHECK(cudaGetDeviceCount(&ctx->device_count));
    
    if (ctx->device_count == 0) {
        fprintf(stderr, "No CUDA devices found\n");
        return -1;
    }
    
    // Get device properties
    for (int i = 0; i < ctx->device_count; i++) {
        CUDA_CHECK(cudaGetDeviceProperties(&ctx->device_props[i], i));
        printf("Device %d: %s\n", i, ctx->device_props[i].name);
        printf("  Compute capability: %d.%d\n", 
               ctx->device_props[i].major, ctx->device_props[i].minor);
        printf("  Global memory: %zu MB\n", 
               ctx->device_props[i].totalGlobalMem / (1024 * 1024));
        printf("  Multiprocessors: %d\n", ctx->device_props[i].multiProcessorCount);
        printf("  Max threads per block: %d\n", ctx->device_props[i].maxThreadsPerBlock);
    }
    
    // Select best device
    cuda_select_device(ctx, 0);
    
    // Create streams
    for (int i = 0; i < ctx->device_count; i++) {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaStreamCreate(&ctx->streams[i]));
        
        // Create cuBLAS handle
        cublasCreate(&ctx->cublas_handles[i]);
        cublasSetStream(ctx->cublas_handles[i], ctx->streams[i]);
        
        // Create cuRAND generator
        curandCreateGenerator(&ctx->curand_generators[i], CURAND_RNG_PSEUDO_DEFAULT);
        curandSetStream(ctx->curand_generators[i], ctx->streams[i]);
    }
    
    // Create events for timing
    CUDA_CHECK(cudaEventCreate(&ctx->start_event));
    CUDA_CHECK(cudaEventCreate(&ctx->stop_event));
    
    return 0;
}

__global__ void vector_add_kernel(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

__global__ void matrix_multiply_kernel(float *a, float *b, float *c, int m, int n, int k) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (row < m && col < n) {
        float sum = 0.0f;
        for (int i = 0; i < k; i++) {
            sum += a[row * k + i] * b[i * n + col];
        }
        c[row * n + col] = sum;
    }
}

__global__ void convolution_2d_kernel(float *input, float *kernel, float *output,
                                     int input_width, int input_height, int kernel_size) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (x < input_width && y < input_height) {
        float sum = 0.0f;
        int kernel_radius = kernel_size / 2;
        
        for (int ky = -kernel_radius; ky <= kernel_radius; ky++) {
            for (int kx = -kernel_radius; kx <= kernel_radius; kx++) {
                int input_x = x + kx;
                int input_y = y + ky;
                
                if (input_x >= 0 && input_x < input_width && 
                    input_y >= 0 && input_y < input_height) {
                    int input_idx = input_y * input_width + input_x;
                    int kernel_idx = (ky + kernel_radius) * kernel_size + (kx + kernel_radius);
                    sum += input[input_idx] * kernel[kernel_idx];
                }
            }
        }
        
        output[y * input_width + x] = sum;
    }
}

int cuda_vector_add(cuda_context_t *ctx, vector_t *a, vector_t *b, vector_t *result) {
    if (!ctx || !a || !b || !result) return -1;
    
    int grid_size = (a->size + BLOCK_SIZE - 1) / BLOCK_SIZE;
    
    CUDA_CHECK(cudaEventRecord(ctx->start_event, ctx->streams[ctx->current_device]));
    
    vector_add_kernel<<<grid_size, BLOCK_SIZE, 0, ctx->streams[ctx->current_device]>>>(
        a->data, b->data, result->data, a->size);
    
    CUDA_CHECK(cudaEventRecord(ctx->stop_event, ctx->streams[ctx->current_device]));
    CUDA_CHECK(cudaEventSynchronize(ctx->stop_event));
    
    CUDA_CHECK(cudaEventElapsedTime(&ctx->last_kernel_time, ctx->start_event, ctx->stop_event));
    
    return 0;
}

int cuda_matrix_multiply(cuda_context_t *ctx, matrix_t *a, matrix_t *b, matrix_t *result) {
    if (!ctx || !a || !b || !result) return -1;
    
    dim3 block_size(16, 16);
    dim3 grid_size((result->cols + block_size.x - 1) / block_size.x,
                   (result->rows + block_size.y - 1) / block_size.y);
    
    CUDA_CHECK(cudaEventRecord(ctx->start_event, ctx->streams[ctx->current_device]));
    
    matrix_multiply_kernel<<<grid_size, block_size, 0, ctx->streams[ctx->current_device]>>>(
        a->data, b->data, result->data, a->rows, b->cols, a->cols);
    
    CUDA_CHECK(cudaEventRecord(ctx->stop_event, ctx->streams[ctx->current_device]));
    CUDA_CHECK(cudaEventSynchronize(ctx->stop_event));
    
    CUDA_CHECK(cudaEventElapsedTime(&ctx->last_kernel_time, ctx->start_event, ctx->stop_event));
    
    return 0;
}

int cuda_convolution_2d(cuda_context_t *ctx, matrix_t *input, matrix_t *kernel, matrix_t *output) {
    if (!ctx || !input || !kernel || !output) return -1;
    
    dim3 block_size(16, 16);
    dim3 grid_size((input->cols + block_size.x - 1) / block_size.x,
                   (input->rows + block_size.y - 1) / block_size.y);
    
    CUDA_CHECK(cudaEventRecord(ctx->start_event, ctx->streams[ctx->current_device]));
    
    convolution_2d_kernel<<<grid_size, block_size, 0, ctx->streams[ctx->current_device]>>>(
        input->data, kernel->data, output->data, input->cols, input->rows, kernel->cols);
    
    CUDA_CHECK(cudaEventRecord(ctx->stop_event, ctx->streams[ctx->current_device]));
    CUDA_CHECK(cudaEventSynchronize(ctx->stop_event));
    
    CUDA_CHECK(cudaEventElapsedTime(&ctx->last_kernel_time, ctx->start_event, ctx->stop_event));
    
    return 0;
}

int cuda_allocate_matrix(cuda_context_t *ctx, matrix_t *matrix, int rows, int cols) {
    if (!ctx || !matrix) return -1;
    
    matrix->rows = rows;
    matrix->cols = cols;
    
    size_t size = rows * cols * sizeof(float);
    CUDA_CHECK(cudaMallocPitch((void**)&matrix->data, &matrix->pitch, cols * sizeof(float), rows));
    
    matrix->on_device = true;
    
    return 0;
}

int cuda_cleanup(cuda_context_t *ctx) {
    if (!ctx) return -1;
    
    // Cleanup streams and handles
    for (int i = 0; i < ctx->device_count; i++) {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaStreamDestroy(ctx->streams[i]));
        cublasDestroy(ctx->cublas_handles[i]);
        curandDestroyGenerator(ctx->curand_generators[i]);
    }
    
    // Cleanup events
    CUDA_CHECK(cudaEventDestroy(ctx->start_event));
    CUDA_CHECK(cudaEventDestroy(ctx->stop_event));
    
    printf("CUDA cleanup completed\n");
    return 0;
}

This comprehensive graphics and GPU programming guide provides:

  1. Advanced Vulkan Engine: Complete modern graphics API implementation with command buffer management, pipeline creation, and resource management
  2. CUDA Parallel Computing: High-performance GPU computing framework with matrix operations, convolution, and memory management
  3. Multi-GPU Support: Device selection and management for scalable computing
  4. Memory Management: Efficient GPU memory allocation and transfer optimization
  5. Performance Profiling: Built-in timing and performance measurement tools
  6. Compute Shaders: GPU-accelerated computing for graphics and scientific applications

The code demonstrates advanced GPU programming techniques essential for building high-performance graphics applications and parallel computing systems.