Advanced Linux Graphics and GPU Programming: Building High-Performance Compute and Rendering Applications
Advanced Linux graphics and GPU programming requires a deep understanding of modern graphics APIs, parallel computing architectures, and optimization techniques. This comprehensive guide explores how to build high-performance applications using Vulkan, CUDA, OpenCL, and advanced rendering techniques for both graphics and general-purpose GPU computing.
Advanced Linux Graphics and GPU Programming
Vulkan High-Performance Rendering Engine
Advanced Vulkan Graphics Framework
// vulkan_engine.c - Advanced Vulkan graphics engine implementation
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>
#include <math.h>
#include <vulkan/vulkan.h>
#include <GLFW/glfw3.h>
#include <cglm/cglm.h>
// Compile-time capacities for the fixed-size arrays inside vulkan_engine_t.
// Sizes the per-frame semaphore, fence and command-buffer arrays; also the
// modulus used to advance current_frame in vulkan_engine_draw_frame.
#define MAX_FRAMES_IN_FLIGHT 2
// Capacity of vulkan_engine_t.descriptor_sets.
#define MAX_DESCRIPTOR_SETS 1000
// Capacity of vulkan_engine_t.device_memory_blocks.
#define MAX_BUFFERS 1000
// NOTE(review): not referenced by any code visible in this file.
#define MAX_TEXTURES 1000
// Capacity of vulkan_engine_t.pipelines and vulkan_engine_t.pipeline_layouts.
#define MAX_PIPELINES 100
// Capacity of vulkan_engine_t.render_passes.
#define MAX_RENDER_PASSES 10
// Vulkan engine structures
typedef struct {
VkInstance instance;
VkPhysicalDevice physical_device;
VkDevice device;
VkQueue graphics_queue;
VkQueue present_queue;
VkQueue compute_queue;
VkCommandPool command_pool;
VkCommandPool compute_command_pool;
// Surface and swapchain
VkSurfaceKHR surface;
VkSwapchainKHR swapchain;
VkFormat swapchain_image_format;
VkExtent2D swapchain_extent;
VkImage *swapchain_images;
VkImageView *swapchain_image_views;
uint32_t swapchain_image_count;
// Synchronization
VkSemaphore image_available_semaphores[MAX_FRAMES_IN_FLIGHT];
VkSemaphore render_finished_semaphores[MAX_FRAMES_IN_FLIGHT];
VkFence in_flight_fences[MAX_FRAMES_IN_FLIGHT];
// Memory management
VkDeviceMemory device_memory_blocks[MAX_BUFFERS];
uint32_t memory_block_count;
// Descriptor management
VkDescriptorPool descriptor_pool;
VkDescriptorSet descriptor_sets[MAX_DESCRIPTOR_SETS];
uint32_t descriptor_set_count;
// Pipeline management
VkRenderPass render_passes[MAX_RENDER_PASSES];
VkPipeline pipelines[MAX_PIPELINES];
VkPipelineLayout pipeline_layouts[MAX_PIPELINES];
uint32_t pipeline_count;
// Command buffers
VkCommandBuffer command_buffers[MAX_FRAMES_IN_FLIGHT];
VkCommandBuffer compute_command_buffers[MAX_FRAMES_IN_FLIGHT];
// Debug
VkDebugUtilsMessengerEXT debug_messenger;
// Frame data
uint32_t current_frame;
bool framebuffer_resized;
} vulkan_engine_t;
// Buffer management
typedef struct {
VkBuffer buffer;
VkDeviceMemory memory;
VkDeviceSize size;
void *mapped_data;
VkBufferUsageFlags usage;
VkMemoryPropertyFlags properties;
} vulkan_buffer_t;
// Texture management
// Groups the Vulkan handles and metadata that describe one texture.
// NOTE(review): no code visible in this file fills this struct in; the field
// notes are read off the Vulkan types and names - confirm against
// create_texture_image / create_texture_sampler.
typedef struct {
VkImage image;
// presumably the device memory bound to `image`
VkDeviceMemory memory;
// presumably a view of `image`
VkImageView view;
VkSampler sampler;
VkFormat format;
// presumably the extent in texels - TODO confirm
uint32_t width;
uint32_t height;
uint32_t mip_levels;
} vulkan_texture_t;
// Shader management
// Groups up to four shader modules with the pipeline stage descriptions built
// from them.
// NOTE(review): no code visible in this file fills this struct in; ownership
// of `stages` (who allocates and frees it) should be confirmed against
// create_shader_stages / destroy_shader.
typedef struct {
VkShaderModule vertex_shader;
VkShaderModule fragment_shader;
VkShaderModule geometry_shader;
VkShaderModule compute_shader;
// presumably an array of stage_count entries - TODO confirm
VkPipelineShaderStageCreateInfo *stages;
uint32_t stage_count;
} vulkan_shader_t;
// Vertex data structures
// Per-vertex attributes, expressed with cglm vector types.
// NOTE(review): the matching vertex input binding/attribute descriptions are
// not visible in this file; keep them in sync with this layout.
typedef struct {
vec3 position;
vec3 normal;
vec2 tex_coord;
vec4 color;
} vertex_t;
// Uniform buffer objects
// CPU-side layout of the per-frame uniform data: three cglm 4x4 matrices
// followed by three 4-component vectors.
// NOTE(review): the shader-side block declaration is not visible here; the
// two layouts must match - confirm against the shaders.
typedef struct {
mat4 model;
mat4 view;
mat4 projection;
vec4 camera_position;
vec4 light_position;
vec4 light_color;
} uniform_buffer_object_t;
// Compute shader data
// One particle record: three 4-component vectors plus a scalar.
// NOTE(review): not referenced by any code visible in this file; the meaning
// and units of `life` should be confirmed against the compute shader.
typedef struct {
vec4 position;
vec4 velocity;
vec4 color;
float life;
} particle_t;
// Function prototypes
// Engine lifecycle entry points. Each takes the engine by pointer and returns
// int; the definitions in this file all return -1 when `engine` is NULL.
int vulkan_engine_init(vulkan_engine_t *engine, GLFWwindow *window);
int vulkan_engine_cleanup(vulkan_engine_t *engine);
int vulkan_engine_draw_frame(vulkan_engine_t *engine);
int vulkan_engine_wait_idle(vulkan_engine_t *engine);
// Vulkan setup functions
// One function per initialization step. vulkan_engine_init treats any
// non-zero return value from the steps it calls as failure.
int create_instance(vulkan_engine_t *engine);
int setup_debug_messenger(vulkan_engine_t *engine);
int create_surface(vulkan_engine_t *engine, GLFWwindow *window);
int pick_physical_device(vulkan_engine_t *engine);
int create_logical_device(vulkan_engine_t *engine);
int create_swapchain(vulkan_engine_t *engine);
int create_image_views(vulkan_engine_t *engine);
int create_render_pass(vulkan_engine_t *engine);
int create_descriptor_set_layout(vulkan_engine_t *engine);
int create_graphics_pipeline(vulkan_engine_t *engine);
int create_framebuffers(vulkan_engine_t *engine);
int create_command_pool(vulkan_engine_t *engine);
int create_vertex_buffer(vulkan_engine_t *engine);
int create_index_buffer(vulkan_engine_t *engine);
int create_uniform_buffers(vulkan_engine_t *engine);
int create_descriptor_pool(vulkan_engine_t *engine);
int create_descriptor_sets(vulkan_engine_t *engine);
int create_command_buffers(vulkan_engine_t *engine);
int create_sync_objects(vulkan_engine_t *engine);
// Buffer management functions
// create_buffer creates a VkBuffer of `size` bytes, allocates device memory
// with the requested property flags, binds the two and records the result in
// *buffer; it returns 0 on success and -1 on failure.
int create_buffer(vulkan_engine_t *engine, VkDeviceSize size, VkBufferUsageFlags usage,
VkMemoryPropertyFlags properties, vulkan_buffer_t *buffer);
int copy_buffer(vulkan_engine_t *engine, vulkan_buffer_t *src, vulkan_buffer_t *dst, VkDeviceSize size);
void destroy_buffer(vulkan_engine_t *engine, vulkan_buffer_t *buffer);
// Texture management functions
int create_texture_image(vulkan_engine_t *engine, const char *filename, vulkan_texture_t *texture);
int create_texture_sampler(vulkan_engine_t *engine, vulkan_texture_t *texture);
void destroy_texture(vulkan_engine_t *engine, vulkan_texture_t *texture);
// Shader management functions
int load_shader_module(vulkan_engine_t *engine, const char *filename, VkShaderModule *shader_module);
int create_shader_stages(vulkan_engine_t *engine, vulkan_shader_t *shader);
void destroy_shader(vulkan_engine_t *engine, vulkan_shader_t *shader);
// Utility functions
// find_memory_type returns the index of the first memory type that is allowed
// by `type_filter` and has every flag in `properties`.
uint32_t find_memory_type(vulkan_engine_t *engine, uint32_t type_filter, VkMemoryPropertyFlags properties);
VkFormat find_supported_format(vulkan_engine_t *engine, VkFormat *candidates, uint32_t candidate_count,
VkImageTiling tiling, VkFormatFeatureFlags features);
int has_stencil_component(VkFormat format);
// Debug callback
/*
 * Debug-utils messenger callback.
 *
 * Writes the message text to stderr when its severity is at least WARNING and
 * drops anything less severe. message_type and user_data are not inspected.
 * Always returns VK_FALSE.
 */
static VKAPI_ATTR VkBool32 VKAPI_CALL debug_callback(
    VkDebugUtilsMessageSeverityFlagBitsEXT message_severity,
    VkDebugUtilsMessageTypeFlagsEXT message_type,
    const VkDebugUtilsMessengerCallbackDataEXT *callback_data,
    void *user_data) {
    const int below_warning =
        message_severity < VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT;

    if (below_warning) {
        return VK_FALSE;
    }

    fprintf(stderr, "Validation layer: %s\n", callback_data->pMessage);
    return VK_FALSE;
}
// Global engine instance
static vulkan_engine_t g_engine;

/*
 * Program entry point: opens a 1920x1080 resizable GLFW window without a
 * client API, initializes the Vulkan engine on it and renders until the
 * window is closed or a frame fails.
 *
 * Returns 0 after a clean run and -1 when GLFW, the window, the engine or a
 * frame fails, so a render failure is visible in the process exit status.
 * Command-line arguments are not used.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    // Initialize GLFW
    if (!glfwInit()) {
        fprintf(stderr, "Failed to initialize GLFW\n");
        return -1;
    }

    // Create window (GLFW_NO_API: no OpenGL context, Vulkan drives the surface)
    glfwWindowHint(GLFW_CLIENT_API, GLFW_NO_API);
    glfwWindowHint(GLFW_RESIZABLE, GLFW_TRUE);
    GLFWwindow *window = glfwCreateWindow(1920, 1080, "Vulkan Engine", NULL, NULL);
    if (!window) {
        fprintf(stderr, "Failed to create GLFW window\n");
        glfwTerminate();
        return -1;
    }

    // Initialize Vulkan engine
    if (vulkan_engine_init(&g_engine, window) != 0) {
        fprintf(stderr, "Failed to initialize Vulkan engine\n");
        glfwDestroyWindow(window);
        glfwTerminate();
        return -1;
    }

    // Main render loop
    int exit_code = 0;
    while (!glfwWindowShouldClose(window)) {
        glfwPollEvents();
        if (vulkan_engine_draw_frame(&g_engine) != 0) {
            fprintf(stderr, "Failed to draw frame\n");
            exit_code = -1;   // remember the failure; teardown still runs below
            break;
        }
    }

    // The GPU may still be using per-frame resources; wait before destroying them
    vulkan_engine_wait_idle(&g_engine);

    // Cleanup
    vulkan_engine_cleanup(&g_engine);
    glfwDestroyWindow(window);
    glfwTerminate();
    return exit_code;
}
/* One initialization step that needs only the engine. */
typedef struct {
    int (*run)(vulkan_engine_t *engine);
    const char *what;   // completes the diagnostic "Failed to <what>"
} engine_init_step_t;

/* Run `count` steps in order; report and stop at the first that fails. */
static int run_engine_init_steps(vulkan_engine_t *engine,
                                 const engine_init_step_t *steps, size_t count) {
    for (size_t i = 0; i < count; i++) {
        if (steps[i].run(engine) != 0) {
            fprintf(stderr, "Failed to %s\n", steps[i].what);
            return -1;
        }
    }
    return 0;
}

/*
 * Bring the engine from nothing to ready-to-draw on `window`.
 *
 * Zeroes *engine and then runs every setup step in dependency order. This
 * includes create_framebuffers and create_index_buffer, whose results
 * vulkan_engine_draw_frame uses (swapchain_framebuffers, index_buffer).
 *
 * Returns 0 on success. Returns -1 when an argument is NULL or a step fails;
 * the failing step is named on stderr. On failure, objects created by earlier
 * steps are NOT released here and remain recorded in *engine.
 */
int vulkan_engine_init(vulkan_engine_t *engine, GLFWwindow *window) {
    // The surface needs the instance; everything after it needs the surface.
    static const engine_init_step_t before_surface[] = {
        {create_instance, "create Vulkan instance"},
        {setup_debug_messenger, "setup debug messenger"},
    };
    static const engine_init_step_t after_surface[] = {
        {pick_physical_device, "pick physical device"},
        {create_logical_device, "create logical device"},
        {create_swapchain, "create swapchain"},
        {create_image_views, "create image views"},
        {create_render_pass, "create render pass"},
        {create_descriptor_set_layout, "create descriptor set layout"},
        {create_graphics_pipeline, "create graphics pipeline"},
        {create_framebuffers, "create framebuffers"},
        {create_command_pool, "create command pool"},
        {create_vertex_buffer, "create vertex buffer"},
        {create_index_buffer, "create index buffer"},
        {create_uniform_buffers, "create uniform buffers"},
        {create_descriptor_pool, "create descriptor pool"},
        {create_descriptor_sets, "create descriptor sets"},
        {create_command_buffers, "create command buffers"},
        {create_sync_objects, "create sync objects"},
    };

    if (!engine || !window) return -1;

    memset(engine, 0, sizeof(vulkan_engine_t));

    if (run_engine_init_steps(engine, before_surface,
                              sizeof before_surface / sizeof before_surface[0]) != 0) {
        return -1;
    }

    // The only step that also needs the window
    if (create_surface(engine, window) != 0) {
        fprintf(stderr, "Failed to create surface\n");
        return -1;
    }

    if (run_engine_init_steps(engine, after_surface,
                              sizeof after_surface / sizeof after_surface[0]) != 0) {
        return -1;
    }

    printf("Vulkan engine initialized successfully\n");
    return 0;
}
/*
 * Record, submit and present one frame using the per-frame resources selected
 * by engine->current_frame.
 *
 * Sequence: wait on the frame's fence, acquire a swapchain image, re-record
 * the frame's command buffer (one render pass, pipeline 0, one indexed draw),
 * submit it to the graphics queue, then present on the present queue.
 *
 * Returns 0 when the frame was presented, and also when the swapchain is out
 * of date, suboptimal or flagged as resized. In those cases the frame is
 * skipped and current_frame is not advanced.
 * TODO: swapchain recreation is not implemented at the two "Recreate
 * swapchain" points.
 * Returns -1 when engine is NULL or any Vulkan call fails.
 */
int vulkan_engine_draw_frame(vulkan_engine_t *engine) {
    if (!engine) return -1;

    const uint32_t frame = engine->current_frame;
    VkCommandBuffer cmd = engine->command_buffers[frame];
    VkFence frame_fence = engine->in_flight_fences[frame];

    // Wait for previous frame to finish
    if (vkWaitForFences(engine->device, 1, &frame_fence, VK_TRUE, UINT64_MAX) != VK_SUCCESS) {
        fprintf(stderr, "Failed to wait for in-flight fence\n");
        return -1;
    }

    // Acquire next image from swapchain
    uint32_t image_index = 0;
    VkResult result = vkAcquireNextImageKHR(engine->device, engine->swapchain, UINT64_MAX,
                                            engine->image_available_semaphores[frame],
                                            VK_NULL_HANDLE, &image_index);
    if (result == VK_ERROR_OUT_OF_DATE_KHR) {
        // Recreate swapchain
        // The fence has not been reset yet, so the next call will not block on it.
        return 0;
    } else if (result != VK_SUCCESS && result != VK_SUBOPTIMAL_KHR) {
        fprintf(stderr, "Failed to acquire swapchain image\n");
        return -1;
    }

    // Reset the fence only once work is certain to be submitted against it
    if (vkResetFences(engine->device, 1, &frame_fence) != VK_SUCCESS) {
        fprintf(stderr, "Failed to reset in-flight fence\n");
        return -1;
    }

    // Reset command buffer
    if (vkResetCommandBuffer(cmd, 0) != VK_SUCCESS) {
        fprintf(stderr, "Failed to reset command buffer\n");
        return -1;
    }

    // Record command buffer
    VkCommandBufferBeginInfo begin_info = {
        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
        .flags = 0,
        .pInheritanceInfo = NULL
    };
    if (vkBeginCommandBuffer(cmd, &begin_info) != VK_SUCCESS) {
        fprintf(stderr, "Failed to begin recording command buffer\n");
        return -1;
    }

    // Begin render pass (clear value 0: colour, clear value 1: depth/stencil)
    VkClearValue clear_values[2] = {
        {.color = {{0.0f, 0.0f, 0.0f, 1.0f}}},
        {.depthStencil = {1.0f, 0}}
    };
    VkRenderPassBeginInfo render_pass_info = {
        .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
        .renderPass = engine->render_passes[0],
        .framebuffer = engine->swapchain_framebuffers[image_index],
        .renderArea.offset = {0, 0},
        .renderArea.extent = engine->swapchain_extent,
        .clearValueCount = 2,
        .pClearValues = clear_values
    };
    vkCmdBeginRenderPass(cmd, &render_pass_info, VK_SUBPASS_CONTENTS_INLINE);

    // Bind graphics pipeline
    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, engine->pipelines[0]);

    // Viewport and scissor both cover the whole swapchain image
    VkViewport viewport = {
        .x = 0.0f,
        .y = 0.0f,
        .width = (float)engine->swapchain_extent.width,
        .height = (float)engine->swapchain_extent.height,
        .minDepth = 0.0f,
        .maxDepth = 1.0f
    };
    vkCmdSetViewport(cmd, 0, 1, &viewport);

    VkRect2D scissor = {
        .offset = {0, 0},
        .extent = engine->swapchain_extent
    };
    vkCmdSetScissor(cmd, 0, 1, &scissor);

    // Bind vertex buffer
    VkBuffer vertex_buffers[] = {engine->vertex_buffer.buffer};
    VkDeviceSize offsets[] = {0};
    vkCmdBindVertexBuffers(cmd, 0, 1, vertex_buffers, offsets);

    // Bind index buffer
    vkCmdBindIndexBuffer(cmd, engine->index_buffer.buffer, 0, VK_INDEX_TYPE_UINT16);

    // Bind descriptor sets (one set per frame in flight)
    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS,
                            engine->pipeline_layouts[0], 0, 1,
                            &engine->descriptor_sets[frame], 0, NULL);

    // Draw indexed
    vkCmdDrawIndexed(cmd, engine->index_count, 1, 0, 0, 0);

    // End render pass
    vkCmdEndRenderPass(cmd);

    // End command buffer
    if (vkEndCommandBuffer(cmd) != VK_SUCCESS) {
        fprintf(stderr, "Failed to record command buffer\n");
        return -1;
    }

    // Submit: wait for the acquired image at colour output, signal render-finished
    VkSemaphore wait_semaphores[] = {engine->image_available_semaphores[frame]};
    VkPipelineStageFlags wait_stages[] = {VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT};
    VkSemaphore signal_semaphores[] = {engine->render_finished_semaphores[frame]};
    VkSubmitInfo submit_info = {
        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
        .waitSemaphoreCount = 1,
        .pWaitSemaphores = wait_semaphores,
        .pWaitDstStageMask = wait_stages,
        .commandBufferCount = 1,
        .pCommandBuffers = &cmd,
        .signalSemaphoreCount = 1,
        .pSignalSemaphores = signal_semaphores
    };
    if (vkQueueSubmit(engine->graphics_queue, 1, &submit_info, frame_fence) != VK_SUCCESS) {
        fprintf(stderr, "Failed to submit draw command buffer\n");
        return -1;
    }

    // Present result once rendering has signalled render-finished
    VkPresentInfoKHR present_info = {
        .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
        .waitSemaphoreCount = 1,
        .pWaitSemaphores = signal_semaphores,
        .swapchainCount = 1,
        .pSwapchains = &engine->swapchain,
        .pImageIndices = &image_index,
        .pResults = NULL
    };
    result = vkQueuePresentKHR(engine->present_queue, &present_info);
    if (result == VK_ERROR_OUT_OF_DATE_KHR || result == VK_SUBOPTIMAL_KHR || engine->framebuffer_resized) {
        engine->framebuffer_resized = false;
        // Recreate swapchain
        return 0;
    } else if (result != VK_SUCCESS) {
        fprintf(stderr, "Failed to present swap chain image\n");
        return -1;
    }

    // Advance to next frame
    engine->current_frame = (frame + 1) % MAX_FRAMES_IN_FLIGHT;
    return 0;
}
/*
 * Create the VkInstance and store it in engine->instance.
 *
 * Enables the instance extensions GLFW requires for surface creation plus
 * VK_EXT_debug_utils, and requests the VK_LAYER_KHRONOS_validation layer.
 * Targets Vulkan API version 1.2.
 *
 * NOTE: the validation layer is requested unconditionally, so instance
 * creation fails on systems where that layer is not installed.
 *
 * Returns 0 on success. Returns -1 when engine is NULL, GLFW cannot supply its
 * extension list, the temporary list cannot be allocated, or vkCreateInstance
 * fails.
 */
int create_instance(vulkan_engine_t *engine) {
    if (!engine) return -1;

    // Application info
    VkApplicationInfo app_info = {
        .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO,
        .pApplicationName = "Vulkan Engine",
        .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
        .pEngineName = "Custom Engine",
        .engineVersion = VK_MAKE_VERSION(1, 0, 0),
        .apiVersion = VK_API_VERSION_1_2
    };

    // Get required extensions; GLFW returns NULL when it cannot provide them
    uint32_t glfw_extension_count = 0;
    const char **glfw_extensions = glfwGetRequiredInstanceExtensions(&glfw_extension_count);
    if (!glfw_extensions) {
        fprintf(stderr, "GLFW could not report the required Vulkan instance extensions\n");
        return -1;
    }

    // GLFW's list plus one slot for the debug extension
    const uint32_t extension_count = glfw_extension_count + 1;
    const char **extensions = malloc(extension_count * sizeof *extensions);
    if (!extensions) {
        fprintf(stderr, "Failed to allocate instance extension list\n");
        return -1;
    }
    memcpy(extensions, glfw_extensions, glfw_extension_count * sizeof *extensions);
    extensions[glfw_extension_count] = VK_EXT_DEBUG_UTILS_EXTENSION_NAME;

    // Validation layers
    static const char *const validation_layers[] = {
        "VK_LAYER_KHRONOS_validation"
    };

    // Instance create info
    VkInstanceCreateInfo create_info = {
        .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
        .pApplicationInfo = &app_info,
        .enabledExtensionCount = extension_count,
        .ppEnabledExtensionNames = extensions,
        .enabledLayerCount = (uint32_t)(sizeof validation_layers / sizeof validation_layers[0]),
        .ppEnabledLayerNames = validation_layers
    };

    // Create instance; the name list is only read during the call
    const VkResult result = vkCreateInstance(&create_info, NULL, &engine->instance);
    free(extensions);

    if (result != VK_SUCCESS) {
        fprintf(stderr, "Failed to create Vulkan instance\n");
        return -1;
    }
    return 0;
}
/*
 * Create a VkBuffer of `size` bytes, allocate device memory for it with the
 * requested property flags, and bind the two together.
 *
 * On success *buffer holds the buffer handle, its memory, and the size, usage
 * and properties it was created with; mapped_data is NULL.
 * On failure nothing is leaked and *buffer holds only null handles.
 *
 * Returns 0 on success. Returns -1 when an argument is NULL, size is 0, no
 * suitable memory type exists, or a Vulkan call fails.
 */
int create_buffer(vulkan_engine_t *engine, VkDeviceSize size, VkBufferUsageFlags usage,
                  VkMemoryPropertyFlags properties, vulkan_buffer_t *buffer) {
    if (!engine || !buffer) return -1;

    // Start from a known state so failure paths never expose stale handles
    buffer->buffer = VK_NULL_HANDLE;
    buffer->memory = VK_NULL_HANDLE;
    buffer->mapped_data = NULL;
    buffer->size = 0;
    buffer->usage = 0;
    buffer->properties = 0;

    if (size == 0) {
        fprintf(stderr, "Failed to create buffer\n");
        return -1;
    }

    // Create buffer
    VkBufferCreateInfo buffer_info = {
        .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
        .size = size,
        .usage = usage,
        .sharingMode = VK_SHARING_MODE_EXCLUSIVE
    };
    if (vkCreateBuffer(engine->device, &buffer_info, NULL, &buffer->buffer) != VK_SUCCESS) {
        fprintf(stderr, "Failed to create buffer\n");
        buffer->buffer = VK_NULL_HANDLE;
        return -1;
    }

    // Get memory requirements
    VkMemoryRequirements mem_requirements;
    vkGetBufferMemoryRequirements(engine->device, buffer->buffer, &mem_requirements);

    // find_memory_type returns 0 both for "memory type 0" and for "nothing
    // matched", so confirm the chosen index really satisfies the request.
    const uint32_t type_index =
        find_memory_type(engine, mem_requirements.memoryTypeBits, properties);
    VkPhysicalDeviceMemoryProperties mem_properties;
    vkGetPhysicalDeviceMemoryProperties(engine->physical_device, &mem_properties);
    const int type_is_suitable =
        type_index < mem_properties.memoryTypeCount &&
        (mem_requirements.memoryTypeBits & ((uint32_t)1 << type_index)) != 0 &&
        (mem_properties.memoryTypes[type_index].propertyFlags & properties) == properties;
    if (!type_is_suitable) {
        fprintf(stderr, "Failed to allocate buffer memory\n");
        goto fail_destroy_buffer;
    }

    // Allocate memory
    VkMemoryAllocateInfo alloc_info = {
        .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
        .allocationSize = mem_requirements.size,
        .memoryTypeIndex = type_index
    };
    if (vkAllocateMemory(engine->device, &alloc_info, NULL, &buffer->memory) != VK_SUCCESS) {
        fprintf(stderr, "Failed to allocate buffer memory\n");
        buffer->memory = VK_NULL_HANDLE;
        goto fail_destroy_buffer;
    }

    // Bind buffer memory
    if (vkBindBufferMemory(engine->device, buffer->buffer, buffer->memory, 0) != VK_SUCCESS) {
        fprintf(stderr, "Failed to bind buffer memory\n");
        goto fail_free_memory;
    }

    // Store buffer info
    buffer->size = size;
    buffer->usage = usage;
    buffer->properties = properties;
    return 0;

fail_free_memory:
    vkFreeMemory(engine->device, buffer->memory, NULL);
    buffer->memory = VK_NULL_HANDLE;
fail_destroy_buffer:
    vkDestroyBuffer(engine->device, buffer->buffer, NULL);
    buffer->buffer = VK_NULL_HANDLE;
    return -1;
}
/*
 * Return the index of the first memory type of engine->physical_device that
 * is permitted by `type_filter` (bit i set means type i is allowed) and has
 * every flag in `properties`.
 *
 * When nothing matches, a message is written to stderr and 0 is returned.
 * NOTE: 0 is also a valid memory type index, so callers that must distinguish
 * the two cases need to re-check the returned type themselves.
 */
uint32_t find_memory_type(vulkan_engine_t *engine, uint32_t type_filter, VkMemoryPropertyFlags properties) {
    VkPhysicalDeviceMemoryProperties mem_properties;
    vkGetPhysicalDeviceMemoryProperties(engine->physical_device, &mem_properties);

    for (uint32_t i = 0; i < mem_properties.memoryTypeCount; i++) {
        // Unsigned shift: memoryTypeCount may be 32, and 1 << 31 on a signed
        // int is undefined behaviour.
        const uint32_t type_bit = (uint32_t)1 << i;
        const VkMemoryPropertyFlags type_flags = mem_properties.memoryTypes[i].propertyFlags;

        if ((type_filter & type_bit) && (type_flags & properties) == properties) {
            return i;
        }
    }

    fprintf(stderr, "Failed to find suitable memory type\n");
    return 0;
}
/*
 * Destroy the Vulkan objects recorded in *engine, in reverse dependency
 * order: device-level objects, then the device, then the debug messenger,
 * surface and instance.
 *
 * Safe to call on an engine whose initialization stopped part-way: device
 * and instance children are only touched when the owning handle was created,
 * and the image-view array is only walked when it exists. Released handles
 * and pointers are cleared so a repeated call does not destroy them twice.
 *
 * NOTE(review): the framebuffers used by vulkan_engine_draw_frame, the
 * compute command pool and any uniform buffers are not released here -
 * confirm whether they are destroyed elsewhere.
 *
 * Returns -1 when engine is NULL, otherwise 0.
 */
int vulkan_engine_cleanup(vulkan_engine_t *engine) {
    if (!engine) return -1;

    if (engine->device != VK_NULL_HANDLE) {
        // Cleanup synchronization objects
        for (uint32_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
            vkDestroySemaphore(engine->device, engine->render_finished_semaphores[i], NULL);
            vkDestroySemaphore(engine->device, engine->image_available_semaphores[i], NULL);
            vkDestroyFence(engine->device, engine->in_flight_fences[i], NULL);
            engine->render_finished_semaphores[i] = VK_NULL_HANDLE;
            engine->image_available_semaphores[i] = VK_NULL_HANDLE;
            engine->in_flight_fences[i] = VK_NULL_HANDLE;
        }

        // Cleanup command pool
        vkDestroyCommandPool(engine->device, engine->command_pool, NULL);
        engine->command_pool = VK_NULL_HANDLE;

        // Cleanup buffers
        destroy_buffer(engine, &engine->vertex_buffer);
        destroy_buffer(engine, &engine->index_buffer);

        // Cleanup descriptor pool
        vkDestroyDescriptorPool(engine->device, engine->descriptor_pool, NULL);
        engine->descriptor_pool = VK_NULL_HANDLE;

        // Cleanup pipelines
        for (uint32_t i = 0; i < engine->pipeline_count; i++) {
            vkDestroyPipeline(engine->device, engine->pipelines[i], NULL);
            vkDestroyPipelineLayout(engine->device, engine->pipeline_layouts[i], NULL);
            engine->pipelines[i] = VK_NULL_HANDLE;
            engine->pipeline_layouts[i] = VK_NULL_HANDLE;
        }
        engine->pipeline_count = 0;

        // Cleanup render passes
        for (uint32_t i = 0; i < MAX_RENDER_PASSES; i++) {
            if (engine->render_passes[i] != VK_NULL_HANDLE) {
                vkDestroyRenderPass(engine->device, engine->render_passes[i], NULL);
                engine->render_passes[i] = VK_NULL_HANDLE;
            }
        }

        // Cleanup swapchain (the view array may be missing after a failed init)
        if (engine->swapchain_image_views) {
            for (uint32_t i = 0; i < engine->swapchain_image_count; i++) {
                vkDestroyImageView(engine->device, engine->swapchain_image_views[i], NULL);
            }
        }
        vkDestroySwapchainKHR(engine->device, engine->swapchain, NULL);
        engine->swapchain = VK_NULL_HANDLE;

        // Cleanup device
        vkDestroyDevice(engine->device, NULL);
        engine->device = VK_NULL_HANDLE;
    }

    // Host-side arrays do not need the device
    free(engine->swapchain_images);
    free(engine->swapchain_image_views);
    engine->swapchain_images = NULL;
    engine->swapchain_image_views = NULL;
    engine->swapchain_image_count = 0;

    if (engine->instance != VK_NULL_HANDLE) {
        // Cleanup debug messenger (extension entry point, looked up at run time)
        if (engine->debug_messenger != VK_NULL_HANDLE) {
            PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)
                vkGetInstanceProcAddr(engine->instance, "vkDestroyDebugUtilsMessengerEXT");
            if (func != NULL) {
                func(engine->instance, engine->debug_messenger, NULL);
            }
            engine->debug_messenger = VK_NULL_HANDLE;
        }

        // Cleanup surface
        vkDestroySurfaceKHR(engine->instance, engine->surface, NULL);
        engine->surface = VK_NULL_HANDLE;

        // Cleanup instance
        vkDestroyInstance(engine->instance, NULL);
        engine->instance = VK_NULL_HANDLE;
    }

    printf("Vulkan engine cleanup completed\n");
    return 0;
}
/*
 * Block until the engine's logical device has finished all submitted work.
 *
 * Returns -1 when engine is NULL; otherwise the vkDeviceWaitIdle result
 * converted to int.
 */
int vulkan_engine_wait_idle(vulkan_engine_t *engine) {
    if (engine == NULL) {
        return -1;
    }

    const VkResult idle_status = vkDeviceWaitIdle(engine->device);
    return idle_status;
}
CUDA Parallel Computing Framework
// cuda_compute.c - Advanced CUDA parallel computing framework
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <cufft.h>
/*
 * Evaluate a CUDA runtime call exactly once. If it does not return
 * cudaSuccess, print the file, line and error string to stderr and terminate
 * the process with exit status 1.
 *
 * The argument is parenthesised. The temporary has a deliberately unusual
 * name so that CUDA_CHECK(err), or any call mentioning a caller variable
 * named `err`, is not shadowed by it.
 */
#define CUDA_CHECK(call) \
do { \
    const cudaError_t cuda_check_status_ = (call); \
    if (cuda_check_status_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                cudaGetErrorString(cuda_check_status_)); \
        exit(1); \
    } \
} while(0)
// Threads per block for the one-dimensional launch in cuda_vector_add.
#define BLOCK_SIZE 256
// NOTE(review): not referenced by any code visible in this file.
#define GRID_SIZE 128
// Capacity of the per-device arrays in cuda_context_t.
#define MAX_DEVICES 8
// CUDA compute context
// Process-wide CUDA state shared by the helpers in this file.
// cuda_init zeroes the struct, then fills device_count, device_props, the
// per-device stream / cuBLAS / cuRAND handles and the two timing events.
typedef struct {
// number of devices reported by cudaGetDeviceCount
int device_count;
// index into streams[] used by the cuda_* wrappers when launching kernels
int current_device;
cudaDeviceProp device_props[MAX_DEVICES];
// one stream per device; the cuBLAS handle and cuRAND generator of the same
// index are bound to it in cuda_init
cudaStream_t streams[MAX_DEVICES];
cublasHandle_t cublas_handles[MAX_DEVICES];
curandGenerator_t curand_generators[MAX_DEVICES];
// Memory pools
// NOTE(review): the three pool fields are not used by any code visible here.
void *device_memory_pool;
size_t pool_size;
size_t pool_used;
// Performance metrics
// recorded around each kernel launch by the cuda_* wrappers
cudaEvent_t start_event;
cudaEvent_t stop_event;
// result of cudaEventElapsedTime for the most recent wrapper call
float last_kernel_time;
} cuda_context_t;
// Matrix operations
// A rows x cols matrix of floats.
// The kernels in this file address elements as data[row * cols + col], i.e.
// they treat the storage as densely packed and row-major.
typedef struct {
float *data;
int rows;
int cols;
// row stride recorded by cuda_allocate_matrix
// NOTE(review): the kernels visible here do not read this field - confirm
// which copy helpers rely on it.
size_t pitch;
// set to true by cuda_allocate_matrix
bool on_device;
} matrix_t;
// Vector operations
// A one-dimensional array of `size` floats.
// cuda_vector_add passes `data` straight to a kernel, so for that call it must
// be device-accessible memory.
typedef struct {
float *data;
int size;
// NOTE(review): presumably true when `data` is device memory; the function
// that sets it is not visible in this file - confirm.
bool on_device;
} vector_t;
// Function prototypes
// Context lifecycle. cuda_init and cuda_cleanup return -1 for a NULL context
// and 0 on success; failing CUDA runtime calls inside them terminate the
// process through CUDA_CHECK.
int cuda_init(cuda_context_t *ctx);
int cuda_cleanup(cuda_context_t *ctx);
int cuda_select_device(cuda_context_t *ctx, int device_id);
// Memory management
// NOTE(review): only cuda_allocate_matrix is defined in the visible part of
// this file; for the copy helpers the parameter names suggest (dst, src)
// order - confirm against their definitions.
int cuda_allocate_matrix(cuda_context_t *ctx, matrix_t *matrix, int rows, int cols);
int cuda_allocate_vector(cuda_context_t *ctx, vector_t *vector, int size);
int cuda_copy_matrix_to_device(cuda_context_t *ctx, matrix_t *dst, matrix_t *src);
int cuda_copy_matrix_to_host(cuda_context_t *ctx, matrix_t *dst, matrix_t *src);
int cuda_copy_vector_to_device(cuda_context_t *ctx, vector_t *dst, vector_t *src);
int cuda_copy_vector_to_host(cuda_context_t *ctx, vector_t *dst, vector_t *src);
void cuda_free_matrix(cuda_context_t *ctx, matrix_t *matrix);
void cuda_free_vector(cuda_context_t *ctx, vector_t *vector);
// CUDA kernels
// Device entry points. Those defined in this file compute one output element
// per thread and ignore threads that fall outside the output range.
__global__ void vector_add_kernel(float *a, float *b, float *c, int n);
__global__ void vector_scale_kernel(float *a, float scale, int n);
__global__ void matrix_multiply_kernel(float *a, float *b, float *c, int m, int n, int k);
__global__ void matrix_transpose_kernel(float *input, float *output, int rows, int cols);
__global__ void convolution_2d_kernel(float *input, float *kernel, float *output,
int input_width, int input_height, int kernel_size);
__global__ void fft_preprocess_kernel(float *input, float *output, int size);
__global__ void reduce_sum_kernel(float *input, float *output, int n);
// High-level operations
// Host wrappers around the kernels. The ones defined in this file launch on
// the stream of ctx->current_device, wait for completion and store the
// elapsed time in ctx->last_kernel_time.
int cuda_vector_add(cuda_context_t *ctx, vector_t *a, vector_t *b, vector_t *result);
int cuda_vector_scale(cuda_context_t *ctx, vector_t *vector, float scale);
int cuda_matrix_multiply(cuda_context_t *ctx, matrix_t *a, matrix_t *b, matrix_t *result);
int cuda_matrix_transpose(cuda_context_t *ctx, matrix_t *input, matrix_t *output);
int cuda_convolution_2d(cuda_context_t *ctx, matrix_t *input, matrix_t *kernel, matrix_t *output);
int cuda_fft_1d(cuda_context_t *ctx, vector_t *input, vector_t *output);
int cuda_reduce_sum(cuda_context_t *ctx, vector_t *input, float *result);
// Performance utilities
float cuda_benchmark_kernel(cuda_context_t *ctx, void (*kernel_func)(void), int iterations);
int cuda_profile_memory_bandwidth(cuda_context_t *ctx);
int cuda_profile_compute_throughput(cuda_context_t *ctx);
// Global context
static cuda_context_t g_cuda_ctx;

/* Allocate `count` floats on the host; on failure report and exit(1), the
 * same policy CUDA_CHECK applies to runtime errors. */
static float *alloc_host_floats(size_t count) {
    float *buffer = (float *)malloc(count * sizeof(float));
    if (!buffer) {
        fprintf(stderr, "Failed to allocate %zu host floats\n", count);
        exit(1);
    }
    return buffer;
}

/* Stop the demo when one of the int-returning cuda_* helpers reports failure. */
static void require_ok(int status, const char *what) {
    if (status != 0) {
        fprintf(stderr, "%s failed\n", what);
        exit(1);
    }
}

/* Copy a densely packed rows x cols host array into a device matrix, using
 * the row stride recorded in dst->pitch. */
static void upload_matrix(const matrix_t *dst, const float *host) {
    const size_t row_bytes = (size_t)dst->cols * sizeof(float);
    CUDA_CHECK(cudaMemcpy2D(dst->data, dst->pitch, host, row_bytes, row_bytes,
                            (size_t)dst->rows, cudaMemcpyHostToDevice));
}

/* Copy `count` host floats into a device vector. */
static void upload_vector(const vector_t *dst, const float *host, int count) {
    CUDA_CHECK(cudaMemcpy(dst->data, host, (size_t)count * sizeof(float),
                          cudaMemcpyHostToDevice));
}

/*
 * Demo driver: initializes CUDA, then runs a 2048x2048 matrix multiplication,
 * a 1,000,000-element vector add/scale/sum and a 1024x1024 convolution with a
 * 5x5 Gaussian-shaped filter, printing timings measured with clock().
 *
 * Input data is generated in host buffers and uploaded with cudaMemcpy /
 * cudaMemcpy2D. The matrices come from cuda_allocate_matrix, which allocates
 * device memory, so the host must not write through matrix_t.data.
 * NOTE(review): the vector upload assumes cuda_allocate_vector also returns
 * device-accessible memory, as cuda_vector_add requires - confirm.
 *
 * Returns 0 on success and -1 when CUDA cannot be initialized; any later
 * failure terminates the process with status 1. Arguments are unused.
 */
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;

    // Initialize CUDA
    if (cuda_init(&g_cuda_ctx) != 0) {
        fprintf(stderr, "Failed to initialize CUDA\n");
        return -1;
    }
    printf("CUDA initialized with %d devices\n", g_cuda_ctx.device_count);

    // Example: Large matrix multiplication
    matrix_t a, b, c;
    int matrix_size = 2048;

    require_ok(cuda_allocate_matrix(&g_cuda_ctx, &a, matrix_size, matrix_size), "cuda_allocate_matrix");
    require_ok(cuda_allocate_matrix(&g_cuda_ctx, &b, matrix_size, matrix_size), "cuda_allocate_matrix");
    require_ok(cuda_allocate_matrix(&g_cuda_ctx, &c, matrix_size, matrix_size), "cuda_allocate_matrix");

    // Generate random inputs on the host, then upload them
    const size_t matrix_elems = (size_t)matrix_size * (size_t)matrix_size;
    float *host_a = alloc_host_floats(matrix_elems);
    float *host_b = alloc_host_floats(matrix_elems);
    for (size_t i = 0; i < matrix_elems; i++) {
        host_a[i] = (float)rand() / RAND_MAX;
        host_b[i] = (float)rand() / RAND_MAX;
    }
    upload_matrix(&a, host_a);
    upload_matrix(&b, host_b);
    free(host_a);
    free(host_b);

    // Perform matrix multiplication
    clock_t start = clock();
    require_ok(cuda_matrix_multiply(&g_cuda_ctx, &a, &b, &c), "cuda_matrix_multiply");
    CUDA_CHECK(cudaDeviceSynchronize());
    clock_t end = clock();

    double elapsed_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    // clock() may not advance during a very short run; avoid dividing by zero
    double gflops = 0.0;
    if (elapsed_time > 0.0) {
        gflops = (2.0 * matrix_size * matrix_size * matrix_size) / (elapsed_time * 1e9);
    }
    printf("Matrix multiplication (%dx%d): %.3f seconds, %.2f GFLOPS\n",
           matrix_size, matrix_size, elapsed_time, gflops);

    // Example: Vector operations
    vector_t vec_a, vec_b, vec_result;
    int vector_size = 1000000;

    require_ok(cuda_allocate_vector(&g_cuda_ctx, &vec_a, vector_size), "cuda_allocate_vector");
    require_ok(cuda_allocate_vector(&g_cuda_ctx, &vec_b, vector_size), "cuda_allocate_vector");
    require_ok(cuda_allocate_vector(&g_cuda_ctx, &vec_result, vector_size), "cuda_allocate_vector");

    float *host_vec_a = alloc_host_floats((size_t)vector_size);
    float *host_vec_b = alloc_host_floats((size_t)vector_size);
    for (int i = 0; i < vector_size; i++) {
        host_vec_a[i] = (float)i;
        host_vec_b[i] = (float)(i * 2);
    }
    upload_vector(&vec_a, host_vec_a, vector_size);
    upload_vector(&vec_b, host_vec_b, vector_size);
    free(host_vec_a);
    free(host_vec_b);

    require_ok(cuda_vector_add(&g_cuda_ctx, &vec_a, &vec_b, &vec_result), "cuda_vector_add");
    require_ok(cuda_vector_scale(&g_cuda_ctx, &vec_result, 0.5f), "cuda_vector_scale");

    // Reduce sum
    float sum_result = 0.0f;
    require_ok(cuda_reduce_sum(&g_cuda_ctx, &vec_result, &sum_result), "cuda_reduce_sum");
    printf("Vector sum result: %.2f\n", sum_result);

    // Example: 2D convolution
    matrix_t image, filter, convolved;
    int image_size = 1024;
    int filter_size = 5;

    require_ok(cuda_allocate_matrix(&g_cuda_ctx, &image, image_size, image_size), "cuda_allocate_matrix");
    require_ok(cuda_allocate_matrix(&g_cuda_ctx, &filter, filter_size, filter_size), "cuda_allocate_matrix");
    require_ok(cuda_allocate_matrix(&g_cuda_ctx, &convolved, image_size, image_size), "cuda_allocate_matrix");

    // Random image
    const size_t image_elems = (size_t)image_size * (size_t)image_size;
    float *host_image = alloc_host_floats(image_elems);
    for (size_t i = 0; i < image_elems; i++) {
        host_image[i] = (float)rand() / RAND_MAX;
    }

    // Gaussian filter (unnormalised), centred on the middle tap
    float *host_filter = alloc_host_floats((size_t)filter_size * (size_t)filter_size);
    float sigma = 1.0f;
    for (int i = 0; i < filter_size; i++) {
        for (int j = 0; j < filter_size; j++) {
            int x = i - filter_size / 2;
            int y = j - filter_size / 2;
            host_filter[i * filter_size + j] = expf(-(x*x + y*y) / (2 * sigma * sigma));
        }
    }

    upload_matrix(&image, host_image);
    upload_matrix(&filter, host_filter);
    free(host_image);
    free(host_filter);

    start = clock();
    require_ok(cuda_convolution_2d(&g_cuda_ctx, &image, &filter, &convolved), "cuda_convolution_2d");
    CUDA_CHECK(cudaDeviceSynchronize());
    end = clock();

    elapsed_time = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("2D convolution (%dx%d): %.3f seconds\n", image_size, image_size, elapsed_time);

    // Cleanup
    cuda_free_matrix(&g_cuda_ctx, &a);
    cuda_free_matrix(&g_cuda_ctx, &b);
    cuda_free_matrix(&g_cuda_ctx, &c);
    cuda_free_vector(&g_cuda_ctx, &vec_a);
    cuda_free_vector(&g_cuda_ctx, &vec_b);
    cuda_free_vector(&g_cuda_ctx, &vec_result);
    cuda_free_matrix(&g_cuda_ctx, &image);
    cuda_free_matrix(&g_cuda_ctx, &filter);
    cuda_free_matrix(&g_cuda_ctx, &convolved);
    cuda_cleanup(&g_cuda_ctx);
    return 0;
}
/*
 * Initialize the CUDA context: enumerate devices, print their properties,
 * create one stream, cuBLAS handle and cuRAND generator per device, and
 * create the two timing events.
 *
 * At most MAX_DEVICES devices are used, because the per-device arrays in
 * cuda_context_t have that capacity. After the per-device loop the active
 * device is switched back to ctx->current_device, so the timing events and
 * later kernel launches run on the device whose stream the wrappers use.
 *
 * Returns 0 on success. Returns -1 when ctx is NULL, no device is found,
 * device selection fails, or a cuBLAS/cuRAND call fails; resources created
 * before such a failure are left in *ctx. A failing CUDA runtime call
 * terminates the process through CUDA_CHECK.
 */
int cuda_init(cuda_context_t *ctx) {
    if (!ctx) return -1;

    memset(ctx, 0, sizeof(cuda_context_t));

    // Get device count
    CUDA_CHECK(cudaGetDeviceCount(&ctx->device_count));
    if (ctx->device_count <= 0) {
        fprintf(stderr, "No CUDA devices found\n");
        return -1;
    }
    if (ctx->device_count > MAX_DEVICES) {
        // Never index past the fixed-size per-device arrays
        fprintf(stderr, "Found %d CUDA devices; using the first %d\n",
                ctx->device_count, MAX_DEVICES);
        ctx->device_count = MAX_DEVICES;
    }

    // Get device properties
    for (int i = 0; i < ctx->device_count; i++) {
        CUDA_CHECK(cudaGetDeviceProperties(&ctx->device_props[i], i));
        printf("Device %d: %s\n", i, ctx->device_props[i].name);
        printf(" Compute capability: %d.%d\n",
               ctx->device_props[i].major, ctx->device_props[i].minor);
        printf(" Global memory: %zu MB\n",
               ctx->device_props[i].totalGlobalMem / (1024 * 1024));
        printf(" Multiprocessors: %d\n", ctx->device_props[i].multiProcessorCount);
        printf(" Max threads per block: %d\n", ctx->device_props[i].maxThreadsPerBlock);
    }

    // Select best device
    if (cuda_select_device(ctx, 0) != 0) {
        fprintf(stderr, "Failed to select CUDA device 0\n");
        return -1;
    }

    // Create per-device streams and library handles
    for (int i = 0; i < ctx->device_count; i++) {
        CUDA_CHECK(cudaSetDevice(i));
        CUDA_CHECK(cudaStreamCreate(&ctx->streams[i]));

        // Create cuBLAS handle bound to this device's stream
        if (cublasCreate(&ctx->cublas_handles[i]) != CUBLAS_STATUS_SUCCESS ||
            cublasSetStream(ctx->cublas_handles[i], ctx->streams[i]) != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "Failed to set up cuBLAS for device %d\n", i);
            return -1;
        }

        // Create cuRAND generator bound to this device's stream
        if (curandCreateGenerator(&ctx->curand_generators[i], CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS ||
            curandSetStream(ctx->curand_generators[i], ctx->streams[i]) != CURAND_STATUS_SUCCESS) {
            fprintf(stderr, "Failed to set up cuRAND for device %d\n", i);
            return -1;
        }
    }

    // The loop left the last device active; go back to the selected one
    CUDA_CHECK(cudaSetDevice(ctx->current_device));

    // Create events for timing
    CUDA_CHECK(cudaEventCreate(&ctx->start_event));
    CUDA_CHECK(cudaEventCreate(&ctx->stop_event));
    return 0;
}
/*
 * Element-wise sum: c[i] = a[i] + b[i] for 0 <= i < n.
 * Each thread handles the single element at its global x index; threads whose
 * index is n or greater do nothing.
 */
__global__ void vector_add_kernel(float *a, float *b, float *c, int n) {
    const int element = blockIdx.x * blockDim.x + threadIdx.x;

    if (element >= n) {
        return;
    }

    c[element] = a[element] + b[element];
}
/*
 * Matrix product c = a * b, one output element per thread.
 * a is read as m x k, b as k x n and c written as m x n, all addressed as
 * densely packed row-major arrays. The thread's global y index selects the
 * output row and its global x index the output column; threads outside the
 * m x n output do nothing.
 */
__global__ void matrix_multiply_kernel(float *a, float *b, float *c, int m, int n, int k) {
    const int out_row = blockIdx.y * blockDim.y + threadIdx.y;
    const int out_col = blockIdx.x * blockDim.x + threadIdx.x;

    if (out_row >= m || out_col >= n) {
        return;
    }

    // Dot product of row out_row of a with column out_col of b
    float acc = 0.0f;
    for (int inner = 0; inner < k; inner++) {
        acc += a[out_row * k + inner] * b[inner * n + out_col];
    }

    c[out_row * n + out_col] = acc;
}
/*
 * Sliding-window filter of a single-channel image, one output pixel per thread.
 *
 * For output pixel (x, y) the result is the sum, over a kernel_size x
 * kernel_size window anchored kernel_size / 2 taps before (x, y), of
 * input * kernel. Window positions outside the image contribute nothing.
 * The kernel is applied without flipping. input and output are
 * input_height x input_width and kernel is kernel_size x kernel_size, all
 * addressed as densely packed row-major arrays.
 *
 * The tap loops run over exactly kernel_size offsets, so every coefficient
 * index stays inside the kernel array for even sizes as well as odd ones.
 * A kernel_size of 0 or less writes 0 to every output pixel.
 */
__global__ void convolution_2d_kernel(float *input, float *kernel, float *output,
                                      int input_width, int input_height, int kernel_size) {
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < input_width && y < input_height) {
        float sum = 0.0f;
        int kernel_radius = kernel_size / 2;
        // One past the last offset: radius + 1 for odd sizes, radius for even
        int offset_end = kernel_size - kernel_radius;

        for (int ky = -kernel_radius; ky < offset_end; ky++) {
            for (int kx = -kernel_radius; kx < offset_end; kx++) {
                int input_x = x + kx;
                int input_y = y + ky;

                if (input_x >= 0 && input_x < input_width &&
                    input_y >= 0 && input_y < input_height) {
                    int input_idx = input_y * input_width + input_x;
                    int kernel_idx = (ky + kernel_radius) * kernel_size + (kx + kernel_radius);
                    sum += input[input_idx] * kernel[kernel_idx];
                }
            }
        }

        output[y * input_width + x] = sum;
    }
}
/*
 * result[i] = a[i] + b[i] for the a->size elements of a.
 *
 * Launches vector_add_kernel on the stream of ctx->current_device with
 * BLOCK_SIZE threads per block, waits for it to finish and stores the elapsed
 * time reported by cudaEventElapsedTime in ctx->last_kernel_time.
 *
 * Returns 0 on success. Returns -1 when an argument or data pointer is NULL,
 * a is empty, or b or result hold fewer elements than a (the kernel would
 * otherwise access memory out of bounds). CUDA runtime and launch errors
 * terminate the process through CUDA_CHECK.
 */
int cuda_vector_add(cuda_context_t *ctx, vector_t *a, vector_t *b, vector_t *result) {
    if (!ctx || !a || !b || !result) return -1;
    if (!a->data || !b->data || !result->data) return -1;
    if (a->size <= 0 || b->size < a->size || result->size < a->size) {
        fprintf(stderr, "cuda_vector_add: incompatible vector sizes\n");
        return -1;
    }

    cudaStream_t stream = ctx->streams[ctx->current_device];
    // Ceiling division written so it cannot overflow for sizes near INT_MAX
    int grid_size = a->size / BLOCK_SIZE + (a->size % BLOCK_SIZE != 0 ? 1 : 0);

    CUDA_CHECK(cudaEventRecord(ctx->start_event, stream));
    vector_add_kernel<<<grid_size, BLOCK_SIZE, 0, stream>>>(
        a->data, b->data, result->data, a->size);
    // A bad launch configuration is only reported through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(ctx->stop_event, stream));
    CUDA_CHECK(cudaEventSynchronize(ctx->stop_event));
    CUDA_CHECK(cudaEventElapsedTime(&ctx->last_kernel_time, ctx->start_event, ctx->stop_event));
    return 0;
}
/*
 * result = a * b, where a is a->rows x a->cols and b is b->rows x b->cols.
 *
 * Launches matrix_multiply_kernel with 16x16 thread blocks on the stream of
 * ctx->current_device, waits for it to finish and stores the elapsed time
 * reported by cudaEventElapsedTime in ctx->last_kernel_time.
 *
 * Returns 0 on success. Returns -1 when an argument or data pointer is NULL
 * or the shapes do not fit: all dimensions must be positive, a->cols must
 * equal b->rows, and result must be a->rows x b->cols. CUDA runtime and
 * launch errors terminate the process through CUDA_CHECK.
 */
int cuda_matrix_multiply(cuda_context_t *ctx, matrix_t *a, matrix_t *b, matrix_t *result) {
    if (!ctx || !a || !b || !result) return -1;
    if (!a->data || !b->data || !result->data) return -1;
    if (a->rows <= 0 || a->cols <= 0 || b->cols <= 0 ||
        a->cols != b->rows ||
        result->rows != a->rows || result->cols != b->cols) {
        fprintf(stderr, "cuda_matrix_multiply: incompatible matrix shapes\n");
        return -1;
    }

    cudaStream_t stream = ctx->streams[ctx->current_device];
    dim3 block_size(16, 16);
    // Enough blocks to cover every output element
    dim3 grid_size((result->cols + block_size.x - 1) / block_size.x,
                   (result->rows + block_size.y - 1) / block_size.y);

    CUDA_CHECK(cudaEventRecord(ctx->start_event, stream));
    matrix_multiply_kernel<<<grid_size, block_size, 0, stream>>>(
        a->data, b->data, result->data, a->rows, b->cols, a->cols);
    // A bad launch configuration is only reported through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(ctx->stop_event, stream));
    CUDA_CHECK(cudaEventSynchronize(ctx->stop_event));
    CUDA_CHECK(cudaEventElapsedTime(&ctx->last_kernel_time, ctx->start_event, ctx->stop_event));
    return 0;
}
/*
 * Filter `input` with `kernel` and write an image of the same size to
 * `output`.
 *
 * Launches convolution_2d_kernel with 16x16 thread blocks on the stream of
 * ctx->current_device, waits for it to finish and stores the elapsed time
 * reported by cudaEventElapsedTime in ctx->last_kernel_time.
 *
 * Returns 0 on success. Returns -1 when an argument or data pointer is NULL
 * or the shapes do not fit: input must be non-empty, output must have the
 * same dimensions as input, and kernel must be square with an odd, positive
 * size (a single size is passed to the device code, and the window is centred
 * on each pixel). CUDA runtime and launch errors terminate the process
 * through CUDA_CHECK.
 */
int cuda_convolution_2d(cuda_context_t *ctx, matrix_t *input, matrix_t *kernel, matrix_t *output) {
    if (!ctx || !input || !kernel || !output) return -1;
    if (!input->data || !kernel->data || !output->data) return -1;
    if (input->rows <= 0 || input->cols <= 0 ||
        output->rows != input->rows || output->cols != input->cols) {
        fprintf(stderr, "cuda_convolution_2d: output must match input dimensions\n");
        return -1;
    }
    if (kernel->cols <= 0 || kernel->rows != kernel->cols || kernel->cols % 2 == 0) {
        fprintf(stderr, "cuda_convolution_2d: kernel must be square with an odd size\n");
        return -1;
    }

    cudaStream_t stream = ctx->streams[ctx->current_device];
    dim3 block_size(16, 16);
    // Enough blocks to cover every output pixel
    dim3 grid_size((input->cols + block_size.x - 1) / block_size.x,
                   (input->rows + block_size.y - 1) / block_size.y);

    CUDA_CHECK(cudaEventRecord(ctx->start_event, stream));
    convolution_2d_kernel<<<grid_size, block_size, 0, stream>>>(
        input->data, kernel->data, output->data, input->cols, input->rows, kernel->cols);
    // A bad launch configuration is only reported through cudaGetLastError
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(ctx->stop_event, stream));
    CUDA_CHECK(cudaEventSynchronize(ctx->stop_event));
    CUDA_CHECK(cudaEventElapsedTime(&ctx->last_kernel_time, ctx->start_event, ctx->stop_event));
    return 0;
}
/*
 * Allocate device memory for a rows x cols float matrix and describe it in
 * *matrix.
 *
 * The storage is densely packed (pitch == cols * sizeof(float)). The kernels
 * in this file address elements as data[row * cols + col], so a padded
 * (pitched) allocation would make them read and write the wrong elements
 * whenever the driver adds row padding.
 *
 * On success matrix->data is a device pointer, which host code must not
 * dereference, and on_device is true.
 *
 * Returns 0 on success. Returns -1, leaving *matrix untouched, when an
 * argument is NULL, a dimension is not positive, or the byte size would
 * overflow size_t. An allocation failure terminates the process through
 * CUDA_CHECK.
 */
int cuda_allocate_matrix(cuda_context_t *ctx, matrix_t *matrix, int rows, int cols) {
    if (!ctx || !matrix) return -1;
    if (rows <= 0 || cols <= 0) return -1;

    // Do the size arithmetic in size_t; rows * cols as int can overflow
    const size_t row_bytes = (size_t)cols * sizeof(float);
    if ((size_t)rows > ((size_t)-1) / row_bytes) return -1;

    float *device_data = NULL;
    CUDA_CHECK(cudaMalloc((void **)&device_data, (size_t)rows * row_bytes));

    matrix->data = device_data;
    matrix->rows = rows;
    matrix->cols = cols;
    matrix->pitch = row_bytes;
    matrix->on_device = true;
    return 0;
}
/*
 * Release everything cuda_init created: the timing events and, for each
 * device, its stream, cuBLAS handle and cuRAND generator.
 *
 * Only handles that were actually created (non-NULL) are destroyed, and each
 * is cleared afterwards, so the function can run on a partially initialized
 * context and a second call does nothing. At most MAX_DEVICES devices are
 * visited, matching the capacity of the per-device arrays.
 *
 * Returns -1 when ctx is NULL, otherwise 0. A failing CUDA runtime call
 * terminates the process through CUDA_CHECK.
 */
int cuda_cleanup(cuda_context_t *ctx) {
    if (!ctx) return -1;

    // Cleanup events
    if (ctx->start_event) {
        CUDA_CHECK(cudaEventDestroy(ctx->start_event));
        ctx->start_event = NULL;
    }
    if (ctx->stop_event) {
        CUDA_CHECK(cudaEventDestroy(ctx->stop_event));
        ctx->stop_event = NULL;
    }

    // Never walk past the fixed-size per-device arrays
    int device_total = ctx->device_count;
    if (device_total > MAX_DEVICES) {
        device_total = MAX_DEVICES;
    }

    // Cleanup streams and handles
    for (int i = 0; i < device_total; i++) {
        CUDA_CHECK(cudaSetDevice(i));

        // Library handles first: they were bound to this device's stream
        if (ctx->cublas_handles[i]) {
            cublasDestroy(ctx->cublas_handles[i]);
            ctx->cublas_handles[i] = NULL;
        }
        if (ctx->curand_generators[i]) {
            curandDestroyGenerator(ctx->curand_generators[i]);
            ctx->curand_generators[i] = NULL;
        }
        if (ctx->streams[i]) {
            CUDA_CHECK(cudaStreamDestroy(ctx->streams[i]));
            ctx->streams[i] = NULL;
        }
    }

    printf("CUDA cleanup completed\n");
    return 0;
}
This comprehensive graphics and GPU programming guide provides:
- Advanced Vulkan Engine: Complete modern graphics API implementation with command buffer management, pipeline creation, and resource management
- CUDA Parallel Computing: High-performance GPU computing framework with matrix operations, convolution, and memory management
- Multi-GPU Support: Device selection and management for scalable computing
- Memory Management: Efficient GPU memory allocation and transfer optimization
- Performance Profiling: Built-in timing and performance measurement tools
- Compute Shaders: GPU-accelerated computing for graphics and scientific applications
The code demonstrates advanced GPU programming techniques essential for building high-performance graphics applications and parallel computing systems.