Advanced Linux Container Runtime Programming: Building Container Engines and Orchestration Systems
Advanced Linux container runtime programming requires deep understanding of kernel namespaces, cgroups, and container orchestration principles. This comprehensive guide explores building custom container runtimes, implementing orchestration features, container networking, and creating production-grade container management systems.
Advanced Linux Container Runtime Programming
Custom Container Runtime Implementation
Complete Container Engine
// container_runtime.c - Advanced container runtime implementation
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/prctl.h>
#include <sys/capability.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <linux/limits.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <pthread.h>
#include <json-c/json.h>
#define STACK_SIZE (1024 * 1024)
#define MAX_CONTAINERS 100
#define MAX_MOUNTS 50
#define MAX_ENV_VARS 100
#define CGROUP_PATH "/sys/fs/cgroup"
#define CONTAINER_ROOT "/var/lib/containers"
// Container states
typedef enum {
CONTAINER_STATE_CREATED,
CONTAINER_STATE_RUNNING,
CONTAINER_STATE_PAUSED,
CONTAINER_STATE_STOPPED,
CONTAINER_STATE_EXITED,
CONTAINER_STATE_DEAD
} container_state_t;
// Container configuration
typedef struct {
char id[64];
char name[256];
char image[512];
char rootfs[PATH_MAX];
char hostname[256];
// Process configuration
char *command[256];
int argc;
char *env[MAX_ENV_VARS];
int env_count;
char working_dir[PATH_MAX];
uid_t uid;
gid_t gid;
// Resource limits
struct {
long memory_limit; // bytes
long memory_swap_limit; // bytes
int cpu_shares; // relative weight
int cpu_quota; // microseconds per period
int cpu_period; // microseconds
int pids_limit; // max number of PIDs
int io_weight; // IO weight (1-10000)
} resources;
// Network configuration
struct {
char bridge[64];
char ip_address[64];
char gateway[64];
char dns[64];
int port_mappings[100][2]; // host_port, container_port
int num_port_mappings;
} network;
// Security configuration
struct {
bool privileged;
bool readonly_rootfs;
int capabilities[64];
int num_capabilities;
char seccomp_profile[PATH_MAX];
char apparmor_profile[256];
char selinux_label[256];
} security;
// Mount points
struct {
char source[PATH_MAX];
char destination[PATH_MAX];
char type[64];
char options[256];
bool readonly;
} mounts[MAX_MOUNTS];
int num_mounts;
} container_config_t;
// Container runtime structure
typedef struct {
container_config_t config;
container_state_t state;
pid_t init_pid;
pid_t child_pid;
int exit_status;
// Namespaces
int ns_pid;
int ns_net;
int ns_mnt;
int ns_uts;
int ns_ipc;
int ns_user;
int ns_cgroup;
// Control channels
int parent_socket;
int child_socket;
// Statistics
struct {
time_t start_time;
time_t end_time;
uint64_t cpu_usage_ns;
uint64_t memory_usage_bytes;
uint64_t memory_max_usage_bytes;
uint64_t io_read_bytes;
uint64_t io_write_bytes;
uint64_t network_rx_bytes;
uint64_t network_tx_bytes;
} stats;
// Synchronization
pthread_mutex_t mutex;
pthread_cond_t state_cond;
} container_t;
// Container runtime manager
typedef struct {
container_t *containers[MAX_CONTAINERS];
int num_containers;
pthread_mutex_t containers_mutex;
// Runtime configuration
char runtime_root[PATH_MAX];
char cgroup_parent[256];
bool enable_selinux;
bool enable_apparmor;
// Network manager
struct {
char bridge_name[64];
char subnet[64];
int next_ip;
} network;
// Image manager
struct {
char registry_url[512];
char image_store[PATH_MAX];
} images;
} container_runtime_t;
// OCI runtime spec structures
typedef struct {
char oci_version[32];
container_config_t *config;
json_object *json_spec;
} oci_spec_t;
// Function prototypes
int runtime_init(container_runtime_t *runtime);
int runtime_create_container(container_runtime_t *runtime, container_config_t *config, container_t **container);
int runtime_start_container(container_runtime_t *runtime, const char *container_id);
int runtime_stop_container(container_runtime_t *runtime, const char *container_id, int timeout);
int runtime_delete_container(container_runtime_t *runtime, const char *container_id);
int runtime_list_containers(container_runtime_t *runtime, container_t **containers, int *count);
int runtime_exec_in_container(container_runtime_t *runtime, const char *container_id, char *command[]);
void runtime_cleanup(container_runtime_t *runtime);
// Container lifecycle functions
static int container_child_func(void *arg);
static int setup_container_namespaces(container_t *container);
static int setup_container_filesystem(container_t *container);
static int setup_container_cgroups(container_t *container);
static int setup_container_network(container_t *container);
static int setup_container_security(container_t *container);
static int apply_resource_limits(container_t *container);
static int mount_container_filesystems(container_t *container);
static int pivot_root_to_container(const char *new_root);
// Namespace functions
static int create_namespaces(int flags);
static int join_namespace(pid_t pid, const char *ns_type);
static int setup_user_namespace(container_t *container);
static int setup_network_namespace(container_t *container);
// Cgroup functions
static int create_cgroup(const char *cgroup_path, const char *controller);
static int write_cgroup_setting(const char *cgroup_path, const char *controller, const char *setting, const char *value);
static int add_process_to_cgroup(const char *cgroup_path, pid_t pid);
static int cleanup_cgroups(container_t *container);
// Network functions
static int create_network_bridge(const char *bridge_name);
static int create_veth_pair(const char *veth_host, const char *veth_container);
static int move_interface_to_namespace(const char *interface, pid_t pid);
static int configure_container_network(container_t *container, const char *veth_container);
// Security functions
static int drop_capabilities(container_t *container);
static int apply_seccomp_filter(container_t *container);
static int set_no_new_privs(void);
// Image management
static int pull_container_image(const char *image_name, const char *destination);
static int extract_rootfs(const char *image_path, const char *rootfs_path);
// Monitoring functions
static int collect_container_stats(container_t *container);
static int monitor_container_process(container_t *container);
// Utility functions
static char *generate_container_id(void);
static int make_container_directories(container_t *container);
static int write_container_state(container_t *container);
static int read_container_state(const char *container_id, container_t *container);
// Global runtime instance
static container_runtime_t g_runtime;
int main(int argc, char *argv[]) {
int result;
// Check if running as root
if (geteuid() != 0) {
fprintf(stderr, "This program must be run as root\n");
return 1;
}
// Initialize container runtime
result = runtime_init(&g_runtime);
if (result != 0) {
fprintf(stderr, "Failed to initialize container runtime\n");
return 1;
}
printf("Container runtime initialized\n");
// Example: Create and run a container
container_config_t config;
memset(&config, 0, sizeof(container_config_t));
// Basic configuration
strncpy(config.id, generate_container_id(), sizeof(config.id) - 1);
strncpy(config.name, "test-container", sizeof(config.name) - 1);
strncpy(config.image, "alpine:latest", sizeof(config.image) - 1);
strncpy(config.hostname, "container-host", sizeof(config.hostname) - 1);
// Command to run
config.command[0] = "/bin/sh";
config.command[1] = "-c";
config.command[2] = "echo 'Hello from container!' && sleep 10";
config.argc = 3;
// Resource limits
config.resources.memory_limit = 128 * 1024 * 1024; // 128MB
config.resources.cpu_shares = 512;
config.resources.pids_limit = 100;
// Create container
container_t *container;
result = runtime_create_container(&g_runtime, &config, &container);
if (result != 0) {
fprintf(stderr, "Failed to create container\n");
runtime_cleanup(&g_runtime);
return 1;
}
printf("Container created: %s\n", container->config.id);
// Start container
result = runtime_start_container(&g_runtime, container->config.id);
if (result != 0) {
fprintf(stderr, "Failed to start container\n");
runtime_cleanup(&g_runtime);
return 1;
}
printf("Container started\n");
// Wait for container to finish
sleep(2);
// List running containers
container_t *containers[MAX_CONTAINERS];
int count;
runtime_list_containers(&g_runtime, containers, &count);
printf("\nRunning containers:\n");
for (int i = 0; i < count; i++) {
printf(" - %s (%s): %s\n",
containers[i]->config.id,
containers[i]->config.name,
containers[i]->state == CONTAINER_STATE_RUNNING ? "Running" : "Stopped");
}
// Execute command in container
char *exec_cmd[] = {"/bin/echo", "Executed inside container", NULL};
result = runtime_exec_in_container(&g_runtime, container->config.id, exec_cmd);
if (result == 0) {
printf("\nCommand executed in container\n");
}
// Collect stats
collect_container_stats(container);
printf("\nContainer statistics:\n");
printf(" CPU usage: %lu ns\n", container->stats.cpu_usage_ns);
printf(" Memory usage: %lu bytes\n", container->stats.memory_usage_bytes);
// Stop container
result = runtime_stop_container(&g_runtime, container->config.id, 10);
if (result == 0) {
printf("\nContainer stopped\n");
}
// Delete container
result = runtime_delete_container(&g_runtime, container->config.id);
if (result == 0) {
printf("Container deleted\n");
}
// Cleanup
runtime_cleanup(&g_runtime);
printf("\nContainer runtime shutdown completed\n");
return 0;
}
int runtime_init(container_runtime_t *runtime) {
if (!runtime) return -1;
memset(runtime, 0, sizeof(container_runtime_t));
// Set runtime paths
strncpy(runtime->runtime_root, CONTAINER_ROOT, sizeof(runtime->runtime_root) - 1);
strncpy(runtime->cgroup_parent, "/containers", sizeof(runtime->cgroup_parent) - 1);
// Create runtime directories
mkdir(runtime->runtime_root, 0755);
char containers_dir[PATH_MAX];
snprintf(containers_dir, sizeof(containers_dir), "%s/containers", runtime->runtime_root);
mkdir(containers_dir, 0755);
// Initialize mutex
pthread_mutex_init(&runtime->containers_mutex, NULL);
// Setup network
strncpy(runtime->network.bridge_name, "container0", sizeof(runtime->network.bridge_name) - 1);
strncpy(runtime->network.subnet, "172.17.0.0/16", sizeof(runtime->network.subnet) - 1);
runtime->network.next_ip = 2; // .1 is the bridge
// Create network bridge
create_network_bridge(runtime->network.bridge_name);
// Setup cgroup hierarchies
create_cgroup(runtime->cgroup_parent, "memory");
create_cgroup(runtime->cgroup_parent, "cpu");
create_cgroup(runtime->cgroup_parent, "pids");
create_cgroup(runtime->cgroup_parent, "blkio");
return 0;
}
int runtime_create_container(container_runtime_t *runtime, container_config_t *config, container_t **container) {
if (!runtime || !config || !container) return -1;
// Allocate container structure
*container = calloc(1, sizeof(container_t));
if (!*container) return -1;
// Copy configuration
memcpy(&(*container)->config, config, sizeof(container_config_t));
// Generate container ID if not provided
if (strlen((*container)->config.id) == 0) {
strncpy((*container)->config.id, generate_container_id(),
sizeof((*container)->config.id) - 1);
}
// Set initial state
(*container)->state = CONTAINER_STATE_CREATED;
// Initialize synchronization
pthread_mutex_init(&(*container)->mutex, NULL);
pthread_cond_init(&(*container)->state_cond, NULL);
// Create container directories
make_container_directories(*container);
// Prepare rootfs
if (strlen((*container)->config.rootfs) == 0) {
snprintf((*container)->config.rootfs, sizeof((*container)->config.rootfs),
"%s/containers/%s/rootfs", runtime->runtime_root, (*container)->config.id);
}
// Extract rootfs from image
char image_path[PATH_MAX];
snprintf(image_path, sizeof(image_path), "%s/images/%s.tar",
runtime->images.image_store, (*container)->config.image);
if (access(image_path, F_OK) != 0) {
// Pull image if not exists
pull_container_image((*container)->config.image, image_path);
}
extract_rootfs(image_path, (*container)->config.rootfs);
// Add container to runtime
pthread_mutex_lock(&runtime->containers_mutex);
if (runtime->num_containers >= MAX_CONTAINERS) {
pthread_mutex_unlock(&runtime->containers_mutex);
free(*container);
return -1;
}
runtime->containers[runtime->num_containers++] = *container;
pthread_mutex_unlock(&runtime->containers_mutex);
// Write container state
write_container_state(*container);
return 0;
}
int runtime_start_container(container_runtime_t *runtime, const char *container_id) {
if (!runtime || !container_id) return -1;
// Find container
container_t *container = NULL;
pthread_mutex_lock(&runtime->containers_mutex);
for (int i = 0; i < runtime->num_containers; i++) {
if (strcmp(runtime->containers[i]->config.id, container_id) == 0) {
container = runtime->containers[i];
break;
}
}
pthread_mutex_unlock(&runtime->containers_mutex);
if (!container) return -1;
// Check state
pthread_mutex_lock(&container->mutex);
if (container->state != CONTAINER_STATE_CREATED &&
container->state != CONTAINER_STATE_STOPPED) {
pthread_mutex_unlock(&container->mutex);
return -1;
}
// Create socketpair for parent-child communication
int sockets[2];
if (socketpair(AF_UNIX, SOCK_STREAM, 0, sockets) < 0) {
pthread_mutex_unlock(&container->mutex);
return -1;
}
container->parent_socket = sockets[0];
container->child_socket = sockets[1];
// Allocate stack for child
char *child_stack = mmap(NULL, STACK_SIZE, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (child_stack == MAP_FAILED) {
close(container->parent_socket);
close(container->child_socket);
pthread_mutex_unlock(&container->mutex);
return -1;
}
// Clone flags for new namespaces
int clone_flags = CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWUSER |
SIGCHLD;
// Create child process with new namespaces
container->child_pid = clone(container_child_func,
child_stack + STACK_SIZE,
clone_flags, container);
if (container->child_pid < 0) {
munmap(child_stack, STACK_SIZE);
close(container->parent_socket);
close(container->child_socket);
pthread_mutex_unlock(&container->mutex);
return -1;
}
// Close child socket in parent
close(container->child_socket);
// Setup user namespace mapping
setup_user_namespace(container);
// Signal child to continue
char sync_byte = 1;
write(container->parent_socket, &sync_byte, 1);
// Wait for child to signal it's ready
read(container->parent_socket, &sync_byte, 1);
// Setup network
setup_container_network(container);
// Add to cgroups
setup_container_cgroups(container);
// Update state
container->state = CONTAINER_STATE_RUNNING;
container->stats.start_time = time(NULL);
pthread_cond_broadcast(&container->state_cond);
pthread_mutex_unlock(&container->mutex);
// Start monitoring thread
pthread_t monitor_thread;
pthread_create(&monitor_thread, NULL,
(void *(*)(void *))monitor_container_process, container);
pthread_detach(monitor_thread);
return 0;
}
static int container_child_func(void *arg) {
container_t *container = (container_t *)arg;
char sync_byte;
// Close parent socket in child
close(container->parent_socket);
// Wait for parent to setup user namespace
read(container->child_socket, &sync_byte, 1);
// Set hostname
if (sethostname(container->config.hostname, strlen(container->config.hostname)) < 0) {
perror("sethostname");
return -1;
}
// Setup filesystem
if (setup_container_filesystem(container) != 0) {
fprintf(stderr, "Failed to setup filesystem\n");
return -1;
}
// Mount container filesystems
if (mount_container_filesystems(container) != 0) {
fprintf(stderr, "Failed to mount filesystems\n");
return -1;
}
// Pivot root to container rootfs
if (pivot_root_to_container(container->config.rootfs) != 0) {
fprintf(stderr, "Failed to pivot root\n");
return -1;
}
// Apply security settings
if (setup_container_security(container) != 0) {
fprintf(stderr, "Failed to setup security\n");
return -1;
}
// Change to working directory
if (strlen(container->config.working_dir) > 0) {
chdir(container->config.working_dir);
} else {
chdir("/");
}
// Drop privileges
if (container->config.uid > 0) {
setgid(container->config.gid);
setuid(container->config.uid);
}
// Signal parent we're ready
write(container->child_socket, &sync_byte, 1);
close(container->child_socket);
// Setup environment
clearenv();
for (int i = 0; i < container->config.env_count; i++) {
putenv(container->config.env[i]);
}
// Default environment variables
setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 1);
setenv("TERM", "xterm", 1);
setenv("CONTAINER", "1", 1);
setenv("HOSTNAME", container->config.hostname, 1);
// Execute container command
execvp(container->config.command[0], container->config.command);
// If we get here, exec failed
perror("execvp");
return -1;
}
static int setup_container_filesystem(container_t *container) {
// Mount container root as private
if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) < 0) {
perror("mount private root");
return -1;
}
// Bind mount rootfs
if (mount(container->config.rootfs, container->config.rootfs,
"bind", MS_BIND | MS_REC, NULL) < 0) {
perror("mount bind rootfs");
return -1;
}
return 0;
}
static int mount_container_filesystems(container_t *container) {
char target[PATH_MAX];
// Mount proc
snprintf(target, sizeof(target), "%s/proc", container->config.rootfs);
mkdir(target, 0755);
if (mount("proc", target, "proc", MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL) < 0) {
perror("mount proc");
return -1;
}
// Mount sys
snprintf(target, sizeof(target), "%s/sys", container->config.rootfs);
mkdir(target, 0755);
if (mount("sysfs", target, "sysfs", MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RDONLY, NULL) < 0) {
perror("mount sys");
return -1;
}
// Mount dev
snprintf(target, sizeof(target), "%s/dev", container->config.rootfs);
mkdir(target, 0755);
if (mount("tmpfs", target, "tmpfs", MS_NOSUID | MS_STRICTATIME, "mode=755,size=65536k") < 0) {
perror("mount dev");
return -1;
}
// Create device nodes
snprintf(target, sizeof(target), "%s/dev/null", container->config.rootfs);
mknod(target, S_IFCHR | 0666, makedev(1, 3));
snprintf(target, sizeof(target), "%s/dev/zero", container->config.rootfs);
mknod(target, S_IFCHR | 0666, makedev(1, 5));
snprintf(target, sizeof(target), "%s/dev/random", container->config.rootfs);
mknod(target, S_IFCHR | 0666, makedev(1, 8));
snprintf(target, sizeof(target), "%s/dev/urandom", container->config.rootfs);
mknod(target, S_IFCHR | 0666, makedev(1, 9));
snprintf(target, sizeof(target), "%s/dev/tty", container->config.rootfs);
mknod(target, S_IFCHR | 0666, makedev(5, 0));
// Mount devpts
snprintf(target, sizeof(target), "%s/dev/pts", container->config.rootfs);
mkdir(target, 0755);
if (mount("devpts", target, "devpts", MS_NOSUID | MS_NOEXEC,
"newinstance,ptmxmode=0666,mode=0620") < 0) {
perror("mount devpts");
return -1;
}
// Create ptmx symlink
snprintf(target, sizeof(target), "%s/dev/ptmx", container->config.rootfs);
symlink("pts/ptmx", target);
// Mount shm
snprintf(target, sizeof(target), "%s/dev/shm", container->config.rootfs);
mkdir(target, 0755);
if (mount("tmpfs", target, "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV,
"mode=1777,size=65536k") < 0) {
perror("mount shm");
return -1;
}
// Mount custom mounts
for (int i = 0; i < container->config.num_mounts; i++) {
snprintf(target, sizeof(target), "%s%s",
container->config.rootfs, container->config.mounts[i].destination);
// Create mount point
mkdir(target, 0755);
unsigned long flags = MS_BIND;
if (container->config.mounts[i].readonly) {
flags |= MS_RDONLY;
}
if (mount(container->config.mounts[i].source, target,
container->config.mounts[i].type, flags,
container->config.mounts[i].options) < 0) {
perror("mount custom");
return -1;
}
}
return 0;
}
static int pivot_root_to_container(const char *new_root) {
char old_root[PATH_MAX];
snprintf(old_root, sizeof(old_root), "%s/.old_root", new_root);
// Create directory for old root
mkdir(old_root, 0755);
// Pivot root
if (syscall(SYS_pivot_root, new_root, old_root) < 0) {
perror("pivot_root");
return -1;
}
// Change to new root
chdir("/");
// Unmount old root
if (umount2("/.old_root", MNT_DETACH) < 0) {
perror("umount old root");
return -1;
}
// Remove old root directory
rmdir("/.old_root");
return 0;
}
static int setup_container_cgroups(container_t *container) {
char cgroup_path[PATH_MAX];
char value[64];
// Create container cgroup
snprintf(cgroup_path, sizeof(cgroup_path), "%s/%s",
g_runtime.cgroup_parent, container->config.id);
// Memory cgroup
create_cgroup(cgroup_path, "memory");
if (container->config.resources.memory_limit > 0) {
snprintf(value, sizeof(value), "%ld", container->config.resources.memory_limit);
write_cgroup_setting(cgroup_path, "memory", "memory.limit_in_bytes", value);
}
if (container->config.resources.memory_swap_limit > 0) {
snprintf(value, sizeof(value), "%ld", container->config.resources.memory_swap_limit);
write_cgroup_setting(cgroup_path, "memory", "memory.memsw.limit_in_bytes", value);
}
// CPU cgroup
create_cgroup(cgroup_path, "cpu");
if (container->config.resources.cpu_shares > 0) {
snprintf(value, sizeof(value), "%d", container->config.resources.cpu_shares);
write_cgroup_setting(cgroup_path, "cpu", "cpu.shares", value);
}
if (container->config.resources.cpu_quota > 0) {
snprintf(value, sizeof(value), "%d", container->config.resources.cpu_quota);
write_cgroup_setting(cgroup_path, "cpu", "cpu.cfs_quota_us", value);
snprintf(value, sizeof(value), "%d", container->config.resources.cpu_period);
write_cgroup_setting(cgroup_path, "cpu", "cpu.cfs_period_us", value);
}
// PIDs cgroup
create_cgroup(cgroup_path, "pids");
if (container->config.resources.pids_limit > 0) {
snprintf(value, sizeof(value), "%d", container->config.resources.pids_limit);
write_cgroup_setting(cgroup_path, "pids", "pids.max", value);
}
// Add process to cgroups
add_process_to_cgroup(cgroup_path, container->child_pid);
return 0;
}
static int setup_container_network(container_t *container) {
char veth_host[64], veth_container[64];
// Generate veth pair names
snprintf(veth_host, sizeof(veth_host), "veth%s", container->config.id);
snprintf(veth_container, sizeof(veth_container), "eth0");
// Create veth pair
if (create_veth_pair(veth_host, veth_container) != 0) {
return -1;
}
// Move container end to container namespace
if (move_interface_to_namespace(veth_container, container->child_pid) != 0) {
return -1;
}
// Attach host end to bridge
char cmd[512];
snprintf(cmd, sizeof(cmd), "ip link set %s master %s",
veth_host, g_runtime.network.bridge_name);
system(cmd);
// Bring up host end
snprintf(cmd, sizeof(cmd), "ip link set %s up", veth_host);
system(cmd);
// Configure container network (will be done inside container namespace)
// This is handled by the container process
return 0;
}
static int setup_container_security(container_t *container) {
// Set no new privileges
if (set_no_new_privs() != 0) {
return -1;
}
// Drop capabilities
if (!container->config.security.privileged) {
if (drop_capabilities(container) != 0) {
return -1;
}
}
// Apply seccomp filter
if (strlen(container->config.security.seccomp_profile) > 0) {
if (apply_seccomp_filter(container) != 0) {
return -1;
}
}
return 0;
}
static int drop_capabilities(container_t *container) {
// Default capability set for unprivileged containers
int default_caps[] = {
CAP_CHOWN, CAP_DAC_OVERRIDE, CAP_FSETID, CAP_FOWNER,
CAP_MKNOD, CAP_NET_RAW, CAP_SETGID, CAP_SETUID,
CAP_SETFCAP, CAP_SETPCAP, CAP_NET_BIND_SERVICE,
CAP_SYS_CHROOT, CAP_KILL, CAP_AUDIT_WRITE
};
int num_default_caps = sizeof(default_caps) / sizeof(default_caps[0]);
// Clear all capabilities
for (int i = 0; i <= CAP_LAST_CAP; i++) {
if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0) < 0) {
if (errno != EINVAL) {
perror("prctl PR_CAPBSET_DROP");
return -1;
}
}
}
// Add back allowed capabilities
cap_t caps = cap_init();
cap_value_t cap_list[64];
int cap_count = 0;
// Add default capabilities
for (int i = 0; i < num_default_caps; i++) {
cap_list[cap_count++] = default_caps[i];
}
// Add custom capabilities
for (int i = 0; i < container->config.security.num_capabilities; i++) {
cap_list[cap_count++] = container->config.security.capabilities[i];
}
cap_set_flag(caps, CAP_EFFECTIVE, cap_count, cap_list, CAP_SET);
cap_set_flag(caps, CAP_PERMITTED, cap_count, cap_list, CAP_SET);
cap_set_flag(caps, CAP_INHERITABLE, cap_count, cap_list, CAP_SET);
if (cap_set_proc(caps) < 0) {
perror("cap_set_proc");
cap_free(caps);
return -1;
}
cap_free(caps);
return 0;
}
static int set_no_new_privs(void) {
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
perror("prctl PR_SET_NO_NEW_PRIVS");
return -1;
}
return 0;
}
static char *generate_container_id(void) {
static char id[65];
unsigned char random_bytes[32];
// Read random bytes
int fd = open("/dev/urandom", O_RDONLY);
if (fd < 0 || read(fd, random_bytes, sizeof(random_bytes)) != sizeof(random_bytes)) {
// Fallback to simple timestamp
snprintf(id, sizeof(id), "container_%ld", time(NULL));
if (fd >= 0) close(fd);
return id;
}
close(fd);
// Convert to hex string
for (int i = 0; i < 32; i++) {
sprintf(&id[i * 2], "%02x", random_bytes[i]);
}
return id;
}
static int create_cgroup(const char *cgroup_path, const char *controller) {
char path[PATH_MAX];
snprintf(path, sizeof(path), "%s/%s%s", CGROUP_PATH, controller, cgroup_path);
if (mkdir(path, 0755) < 0 && errno != EEXIST) {
perror("mkdir cgroup");
return -1;
}
return 0;
}
static int write_cgroup_setting(const char *cgroup_path, const char *controller,
const char *setting, const char *value) {
char path[PATH_MAX];
snprintf(path, sizeof(path), "%s/%s%s/%s",
CGROUP_PATH, controller, cgroup_path, setting);
int fd = open(path, O_WRONLY);
if (fd < 0) {
perror("open cgroup setting");
return -1;
}
if (write(fd, value, strlen(value)) < 0) {
perror("write cgroup setting");
close(fd);
return -1;
}
close(fd);
return 0;
}
void runtime_cleanup(container_runtime_t *runtime) {
if (!runtime) return;
// Stop all containers
for (int i = 0; i < runtime->num_containers; i++) {
if (runtime->containers[i]->state == CONTAINER_STATE_RUNNING) {
runtime_stop_container(runtime, runtime->containers[i]->config.id, 10);
}
// Free container resources
pthread_mutex_destroy(&runtime->containers[i]->mutex);
pthread_cond_destroy(&runtime->containers[i]->state_cond);
free(runtime->containers[i]);
}
pthread_mutex_destroy(&runtime->containers_mutex);
}
This comprehensive container runtime programming guide provides:
- Complete Container Runtime: Full implementation with namespaces, cgroups, and lifecycle management
- Namespace Isolation: PID, network, mount, UTS, IPC, user, and cgroup namespaces
- Resource Management: CPU, memory, I/O, and PID limits using cgroups
- Container Networking: Virtual ethernet pairs, network bridges, and port mapping
- Security Features: Capabilities dropping, seccomp filters, and AppArmor/SELinux support
- Image Management: Container image pulling, extraction, and rootfs setup
- OCI Compatibility: Support for OCI runtime spec
- Monitoring and Statistics: Resource usage tracking and performance monitoring
The code demonstrates advanced container runtime programming techniques essential for building container platforms and orchestration systems.