Container Internals Deep Dive: Namespaces, Cgroups, and Runtime Implementation
Containers have revolutionized application deployment and isolation, but their magic lies in fundamental Linux kernel features. Understanding namespaces, cgroups, and container runtimes at a deep level is essential for building secure, efficient containerized systems and troubleshooting complex container environments.
Container Internals Deep Dive
Linux Namespaces: The Foundation of Isolation
Understanding Namespace Types
// namespace_demo.c - Linux namespace programming
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
// Namespace types and their purposes
typedef struct {
int flag;
const char* name;
const char* description;
} namespace_info_t;
static namespace_info_t namespaces[] = {
{CLONE_NEWPID, "pid", "Process ID isolation"},
{CLONE_NEWNET, "net", "Network stack isolation"},
{CLONE_NEWUTS, "uts", "Hostname and NIS domain isolation"},
{CLONE_NEWIPC, "ipc", "System V IPC isolation"},
{CLONE_NEWNS, "mnt", "Mount point isolation"},
{CLONE_NEWUSER, "user", "User and group ID isolation"},
{CLONE_NEWCGROUP, "cgroup", "Cgroup root directory isolation"},
{CLONE_NEWTIME, "time", "Boot and monotonic clock isolation"}
};
// Create a new namespace
int create_namespace(int namespace_flags, int (*child_func)(void*), void* arg) {
const int STACK_SIZE = 1024 * 1024;
char* stack = malloc(STACK_SIZE);
char* stack_top = stack + STACK_SIZE;
if (!stack) {
perror("malloc");
return -1;
}
pid_t child_pid = clone(child_func, stack_top, namespace_flags | SIGCHLD, arg);
if (child_pid == -1) {
perror("clone");
free(stack);
return -1;
}
// Wait for child
int status;
waitpid(child_pid, &status, 0);
free(stack);
return WEXITSTATUS(status);
}
// Demonstrate PID namespace
int pid_namespace_demo(void* arg) {
printf("Inside PID namespace:\n");
printf(" PID: %d (should be 1)\n", getpid());
printf(" PPID: %d (should be 0)\n", getppid());
// Show process list
system("ps aux | head -10");
return 0;
}
// Demonstrate UTS namespace
int uts_namespace_demo(void* arg) {
const char* new_hostname = "container-host";
printf("Original hostname: ");
system("hostname");
if (sethostname(new_hostname, strlen(new_hostname)) == -1) {
perror("sethostname");
return 1;
}
printf("New hostname: ");
system("hostname");
return 0;
}
// Demonstrate mount namespace
int mount_namespace_demo(void* arg) {
// Create a temporary directory
if (mkdir("/tmp/container_root", 0755) == -1 && errno != EEXIST) {
perror("mkdir");
return 1;
}
// Mount tmpfs as new root
if (mount("tmpfs", "/tmp/container_root", "tmpfs", 0, "size=100m") == -1) {
perror("mount tmpfs");
return 1;
}
// Create basic directory structure
mkdir("/tmp/container_root/bin", 0755);
mkdir("/tmp/container_root/usr", 0755);
mkdir("/tmp/container_root/etc", 0755);
// Change root to new filesystem
if (chroot("/tmp/container_root") == -1) {
perror("chroot");
return 1;
}
if (chdir("/") == -1) {
perror("chdir");
return 1;
}
printf("Inside mount namespace:\n");
system("ls -la /");
return 0;
}
// Network namespace utilities
void setup_loopback_interface() {
// Bring up loopback interface in new network namespace
system("ip link set dev lo up");
system("ip addr add 127.0.0.1/8 dev lo");
}
int network_namespace_demo(void* arg) {
printf("Network interfaces in new namespace:\n");
system("ip link show");
setup_loopback_interface();
printf("\nAfter setting up loopback:\n");
system("ip link show");
system("ip addr show");
return 0;
}
// User namespace mapping
void setup_user_namespace_mappings(pid_t child_pid) {
char path[256];
FILE* file;
// Map root user
snprintf(path, sizeof(path), "/proc/%d/uid_map", child_pid);
file = fopen(path, "w");
if (file) {
fprintf(file, "0 %d 1", getuid());
fclose(file);
}
// Deny setgroups
snprintf(path, sizeof(path), "/proc/%d/setgroups", child_pid);
file = fopen(path, "w");
if (file) {
fprintf(file, "deny");
fclose(file);
}
// Map root group
snprintf(path, sizeof(path), "/proc/%d/gid_map", child_pid);
file = fopen(path, "w");
if (file) {
fprintf(file, "0 %d 1", getgid());
fclose(file);
}
}
int user_namespace_demo(void* arg) {
printf("User namespace demo:\n");
printf(" UID: %d (should be 0)\n", getuid());
printf(" GID: %d (should be 0)\n", getgid());
printf(" EUID: %d (should be 0)\n", geteuid());
printf(" EGID: %d (should be 0)\n", getegid());
// Try to access /etc/shadow (should fail)
printf(" Trying to read /etc/shadow: ");
if (access("/etc/shadow", R_OK) == 0) {
printf("SUCCESS (unexpected!)\n");
} else {
printf("FAILED (expected)\n");
}
return 0;
}
Advanced Namespace Operations
// namespace_advanced.c - Advanced namespace management
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
// Join existing namespace
int join_namespace(pid_t target_pid, const char* ns_type) {
char ns_path[256];
snprintf(ns_path, sizeof(ns_path), "/proc/%d/ns/%s", target_pid, ns_type);
int fd = open(ns_path, O_RDONLY);
if (fd == -1) {
perror("open namespace");
return -1;
}
if (setns(fd, 0) == -1) {
perror("setns");
close(fd);
return -1;
}
close(fd);
return 0;
}
// Create persistent namespace
int create_persistent_namespace(const char* name, int ns_flags) {
char bind_path[256];
snprintf(bind_path, sizeof(bind_path), "/tmp/ns_%s", name);
// Create bind mount target
int fd = open(bind_path, O_RDONLY | O_CREAT, 0644);
if (fd == -1) {
perror("create bind target");
return -1;
}
close(fd);
// Unshare namespace
if (unshare(ns_flags) == -1) {
perror("unshare");
return -1;
}
// Bind mount namespace file
char current_ns[256];
snprintf(current_ns, sizeof(current_ns), "/proc/%d/ns/net", getpid());
if (mount(current_ns, bind_path, NULL, MS_BIND, NULL) == -1) {
perror("bind mount namespace");
return -1;
}
printf("Created persistent namespace: %s\n", bind_path);
return 0;
}
// Container-style namespace setup
typedef struct {
char* hostname;
char* root_path;
int enable_networking;
} container_config_t;
int setup_container_namespaces(container_config_t* config) {
// Unshare all namespaces except user (for simplicity)
int ns_flags = CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWUTS |
CLONE_NEWIPC | CLONE_NEWNS;
if (unshare(ns_flags) == -1) {
perror("unshare namespaces");
return -1;
}
// Set hostname
if (config->hostname &&
sethostname(config->hostname, strlen(config->hostname)) == -1) {
perror("sethostname");
return -1;
}
// Setup mount namespace
if (config->root_path) {
// Mount new root
if (mount(config->root_path, config->root_path, NULL, MS_BIND, NULL) == -1) {
perror("bind mount root");
return -1;
}
// Change root
if (chdir(config->root_path) == -1) {
perror("chdir to new root");
return -1;
}
if (chroot(".") == -1) {
perror("chroot");
return -1;
}
if (chdir("/") == -1) {
perror("chdir to /");
return -1;
}
}
// Setup networking if requested
if (config->enable_networking) {
setup_loopback_interface();
}
return 0;
}
Control Groups (Cgroups): Resource Management
Cgroups v1 Implementation
// cgroups_v1.c - Cgroups v1 resource management
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#define CGROUP_MOUNT "/sys/fs/cgroup"
typedef struct {
char name[256];
long memory_limit; // bytes
long cpu_shares; // relative weight
long cpu_quota; // microseconds per period
long cpu_period; // microseconds
char* allowed_devices; // device whitelist
} cgroup_config_t;
// Write value to cgroup file
int write_cgroup_file(const char* controller, const char* cgroup_name,
const char* file, const char* value) {
char path[512];
snprintf(path, sizeof(path), "%s/%s/%s/%s",
CGROUP_MOUNT, controller, cgroup_name, file);
int fd = open(path, O_WRONLY);
if (fd == -1) {
perror("open cgroup file");
return -1;
}
if (write(fd, value, strlen(value)) == -1) {
perror("write cgroup file");
close(fd);
return -1;
}
close(fd);
return 0;
}
// Read value from cgroup file
int read_cgroup_file(const char* controller, const char* cgroup_name,
const char* file, char* buffer, size_t size) {
char path[512];
snprintf(path, sizeof(path), "%s/%s/%s/%s",
CGROUP_MOUNT, controller, cgroup_name, file);
int fd = open(path, O_RDONLY);
if (fd == -1) {
perror("open cgroup file");
return -1;
}
ssize_t bytes = read(fd, buffer, size - 1);
if (bytes == -1) {
perror("read cgroup file");
close(fd);
return -1;
}
buffer[bytes] = '\0';
close(fd);
return 0;
}
// Create cgroup
int create_cgroup(const char* controller, const char* cgroup_name) {
char path[512];
snprintf(path, sizeof(path), "%s/%s/%s",
CGROUP_MOUNT, controller, cgroup_name);
if (mkdir(path, 0755) == -1 && errno != EEXIST) {
perror("mkdir cgroup");
return -1;
}
return 0;
}
// Apply cgroup configuration
int apply_cgroup_config(cgroup_config_t* config) {
char value[256];
// Create cgroups for each controller
create_cgroup("memory", config->name);
create_cgroup("cpu", config->name);
create_cgroup("devices", config->name);
// Set memory limit
if (config->memory_limit > 0) {
snprintf(value, sizeof(value), "%ld", config->memory_limit);
write_cgroup_file("memory", config->name, "memory.limit_in_bytes", value);
// Disable swap for container
write_cgroup_file("memory", config->name, "memory.swappiness", "0");
// Set OOM killer behavior
write_cgroup_file("memory", config->name, "memory.oom_control", "1");
}
// Set CPU limits
if (config->cpu_shares > 0) {
snprintf(value, sizeof(value), "%ld", config->cpu_shares);
write_cgroup_file("cpu", config->name, "cpu.shares", value);
}
if (config->cpu_quota > 0 && config->cpu_period > 0) {
snprintf(value, sizeof(value), "%ld", config->cpu_period);
write_cgroup_file("cpu", config->name, "cpu.cfs_period_us", value);
snprintf(value, sizeof(value), "%ld", config->cpu_quota);
write_cgroup_file("cpu", config->name, "cpu.cfs_quota_us", value);
}
// Set device restrictions
if (config->allowed_devices) {
// Deny all devices first
write_cgroup_file("devices", config->name, "devices.deny", "a");
// Allow specific devices
write_cgroup_file("devices", config->name, "devices.allow", config->allowed_devices);
}
return 0;
}
// Add process to cgroup
int add_process_to_cgroup(const char* controller, const char* cgroup_name, pid_t pid) {
char value[64];
snprintf(value, sizeof(value), "%d", pid);
return write_cgroup_file(controller, cgroup_name, "cgroup.procs", value);
}
// Monitor cgroup resource usage
void monitor_cgroup_usage(const char* cgroup_name) {
char buffer[1024];
printf("=== Cgroup Resource Usage: %s ===\n", cgroup_name);
// Memory usage
if (read_cgroup_file("memory", cgroup_name, "memory.usage_in_bytes",
buffer, sizeof(buffer)) == 0) {
long usage = strtol(buffer, NULL, 10);
printf("Memory usage: %ld bytes (%.2f MB)\n", usage, usage / 1024.0 / 1024.0);
}
// Memory limit
if (read_cgroup_file("memory", cgroup_name, "memory.limit_in_bytes",
buffer, sizeof(buffer)) == 0) {
long limit = strtol(buffer, NULL, 10);
printf("Memory limit: %ld bytes (%.2f MB)\n", limit, limit / 1024.0 / 1024.0);
}
// CPU usage
if (read_cgroup_file("cpu", cgroup_name, "cpuacct.usage",
buffer, sizeof(buffer)) == 0) {
long usage = strtol(buffer, NULL, 10);
printf("CPU usage: %ld nanoseconds (%.2f seconds)\n", usage, usage / 1e9);
}
// Process count
if (read_cgroup_file("memory", cgroup_name, "cgroup.procs",
buffer, sizeof(buffer)) == 0) {
int count = 0;
char* line = strtok(buffer, "\n");
while (line) {
count++;
line = strtok(NULL, "\n");
}
printf("Process count: %d\n", count);
}
}
Cgroups v2 (Unified Hierarchy)
// cgroups_v2.c - Cgroups v2 unified hierarchy
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define CGROUP_V2_MOUNT "/sys/fs/cgroup"
typedef struct {
char name[256];
char memory_max[64]; // "100M", "1G", "max"
char memory_high[64]; // soft limit
char cpu_max[64]; // "50000 100000" (quota period)
int cpu_weight; // 1-10000
char io_max[256]; // "8:16 rbps=2097152 wbps=1048576"
} cgroup_v2_config_t;
// Write to cgroup v2 file
int write_cgroup_v2_file(const char* cgroup_name, const char* file, const char* value) {
char path[512];
snprintf(path, sizeof(path), "%s/%s/%s", CGROUP_V2_MOUNT, cgroup_name, file);
FILE* fp = fopen(path, "w");
if (!fp) {
perror("fopen cgroup v2 file");
return -1;
}
if (fprintf(fp, "%s", value) < 0) {
perror("write cgroup v2 file");
fclose(fp);
return -1;
}
fclose(fp);
return 0;
}
// Create cgroup v2
int create_cgroup_v2(const char* cgroup_name) {
char path[512];
snprintf(path, sizeof(path), "%s/%s", CGROUP_V2_MOUNT, cgroup_name);
if (mkdir(path, 0755) == -1 && errno != EEXIST) {
perror("mkdir cgroup v2");
return -1;
}
// Enable controllers
write_cgroup_v2_file(cgroup_name, "cgroup.subtree_control",
"+cpu +memory +io +pids");
return 0;
}
// Apply cgroup v2 configuration
int apply_cgroup_v2_config(cgroup_v2_config_t* config) {
create_cgroup_v2(config->name);
// Memory limits
if (strlen(config->memory_max) > 0) {
write_cgroup_v2_file(config->name, "memory.max", config->memory_max);
}
if (strlen(config->memory_high) > 0) {
write_cgroup_v2_file(config->name, "memory.high", config->memory_high);
}
// CPU limits
if (strlen(config->cpu_max) > 0) {
write_cgroup_v2_file(config->name, "cpu.max", config->cpu_max);
}
if (config->cpu_weight > 0) {
char weight[64];
snprintf(weight, sizeof(weight), "%d", config->cpu_weight);
write_cgroup_v2_file(config->name, "cpu.weight", weight);
}
// IO limits
if (strlen(config->io_max) > 0) {
write_cgroup_v2_file(config->name, "io.max", config->io_max);
}
return 0;
}
// Advanced cgroup v2 features
void setup_memory_events(const char* cgroup_name) {
// Set up memory pressure events
write_cgroup_v2_file(cgroup_name, "memory.events", "");
// Configure pressure stall information
write_cgroup_v2_file(cgroup_name, "cgroup.pressure", "1");
}
// Monitor cgroup v2 statistics
void monitor_cgroup_v2_stats(const char* cgroup_name) {
char path[512];
FILE* fp;
char line[256];
printf("=== Cgroup v2 Statistics: %s ===\n", cgroup_name);
// Memory stats
snprintf(path, sizeof(path), "%s/%s/memory.stat", CGROUP_V2_MOUNT, cgroup_name);
fp = fopen(path, "r");
if (fp) {
printf("\nMemory Statistics:\n");
while (fgets(line, sizeof(line), fp)) {
if (strstr(line, "anon") || strstr(line, "file") ||
strstr(line, "kernel") || strstr(line, "sock")) {
printf(" %s", line);
}
}
fclose(fp);
}
// CPU stats
snprintf(path, sizeof(path), "%s/%s/cpu.stat", CGROUP_V2_MOUNT, cgroup_name);
fp = fopen(path, "r");
if (fp) {
printf("\nCPU Statistics:\n");
while (fgets(line, sizeof(line), fp)) {
printf(" %s", line);
}
fclose(fp);
}
// IO stats
snprintf(path, sizeof(path), "%s/%s/io.stat", CGROUP_V2_MOUNT, cgroup_name);
fp = fopen(path, "r");
if (fp) {
printf("\nIO Statistics:\n");
while (fgets(line, sizeof(line), fp)) {
printf(" %s", line);
}
fclose(fp);
}
// Pressure information
snprintf(path, sizeof(path), "%s/%s/memory.pressure", CGROUP_V2_MOUNT, cgroup_name);
fp = fopen(path, "r");
if (fp) {
printf("\nMemory Pressure:\n");
while (fgets(line, sizeof(line), fp)) {
printf(" %s", line);
}
fclose(fp);
}
}
Building a Container Runtime
Simple Container Implementation
// simple_container.c - Basic container runtime
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/mount.h>
#include <sys/capability.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct {
char* name;
char* image_path;
char* command;
char** args;
char** env;
// Resource limits
long memory_limit;
long cpu_shares;
// Capabilities
cap_value_t* capabilities;
int num_capabilities;
// Networking
int enable_networking;
char* ip_address;
// Security
int read_only_root;
char** bind_mounts;
} container_spec_t;
// Capability dropping
int drop_capabilities(cap_value_t* keep_caps, int num_caps) {
cap_t caps = cap_get_proc();
if (!caps) {
perror("cap_get_proc");
return -1;
}
// Clear all capabilities
if (cap_clear(caps) == -1) {
perror("cap_clear");
cap_free(caps);
return -1;
}
// Set only allowed capabilities
if (num_caps > 0) {
if (cap_set_flag(caps, CAP_EFFECTIVE, num_caps, keep_caps, CAP_SET) == -1 ||
cap_set_flag(caps, CAP_PERMITTED, num_caps, keep_caps, CAP_SET) == -1) {
perror("cap_set_flag");
cap_free(caps);
return -1;
}
}
// Apply capabilities
if (cap_set_proc(caps) == -1) {
perror("cap_set_proc");
cap_free(caps);
return -1;
}
cap_free(caps);
return 0;
}
// Setup container filesystem
int setup_container_fs(container_spec_t* spec) {
// Create container root directory
char container_root[256];
snprintf(container_root, sizeof(container_root), "/tmp/container_%s", spec->name);
if (mkdir(container_root, 0755) == -1 && errno != EEXIST) {
perror("mkdir container root");
return -1;
}
// Mount container image
if (mount(spec->image_path, container_root, NULL, MS_BIND, NULL) == -1) {
perror("mount container image");
return -1;
}
// Make read-only if requested
if (spec->read_only_root) {
if (mount(NULL, container_root, NULL, MS_REMOUNT | MS_RDONLY | MS_BIND, NULL) == -1) {
perror("remount read-only");
return -1;
}
}
// Setup bind mounts
if (spec->bind_mounts) {
for (int i = 0; spec->bind_mounts[i]; i++) {
char* mount_spec = strdup(spec->bind_mounts[i]);
char* src = strtok(mount_spec, ":");
char* dst = strtok(NULL, ":");
char full_dst[512];
snprintf(full_dst, sizeof(full_dst), "%s%s", container_root, dst);
// Create destination directory
mkdir(full_dst, 0755);
if (mount(src, full_dst, NULL, MS_BIND, NULL) == -1) {
perror("bind mount");
free(mount_spec);
return -1;
}
free(mount_spec);
}
}
// Setup essential filesystems
char proc_path[512], sys_path[512], dev_path[512];
snprintf(proc_path, sizeof(proc_path), "%s/proc", container_root);
snprintf(sys_path, sizeof(sys_path), "%s/sys", container_root);
snprintf(dev_path, sizeof(dev_path), "%s/dev", container_root);
mkdir(proc_path, 0755);
mkdir(sys_path, 0755);
mkdir(dev_path, 0755);
mount("proc", proc_path, "proc", 0, NULL);
mount("sysfs", sys_path, "sysfs", 0, NULL);
mount("tmpfs", dev_path, "tmpfs", 0, "size=1m");
// Create essential device nodes
char dev_null[512], dev_zero[512];
snprintf(dev_null, sizeof(dev_null), "%s/dev/null", container_root);
snprintf(dev_zero, sizeof(dev_zero), "%s/dev/zero", container_root);
mknod(dev_null, S_IFCHR | 0666, makedev(1, 3));
mknod(dev_zero, S_IFCHR | 0666, makedev(1, 5));
// Change root
if (chdir(container_root) == -1) {
perror("chdir");
return -1;
}
if (chroot(".") == -1) {
perror("chroot");
return -1;
}
if (chdir("/") == -1) {
perror("chdir to /");
return -1;
}
return 0;
}
// Container main function
int container_main(void* arg) {
container_spec_t* spec = (container_spec_t*)arg;
// Setup cgroup
cgroup_config_t cgroup_config = {0};
strncpy(cgroup_config.name, spec->name, sizeof(cgroup_config.name) - 1);
cgroup_config.memory_limit = spec->memory_limit;
cgroup_config.cpu_shares = spec->cpu_shares;
apply_cgroup_config(&cgroup_config);
add_process_to_cgroup("memory", spec->name, getpid());
add_process_to_cgroup("cpu", spec->name, getpid());
// Setup hostname
if (sethostname(spec->name, strlen(spec->name)) == -1) {
perror("sethostname");
return 1;
}
// Setup filesystem
if (setup_container_fs(spec) == -1) {
return 1;
}
// Setup networking
if (spec->enable_networking) {
setup_loopback_interface();
if (spec->ip_address) {
char cmd[256];
snprintf(cmd, sizeof(cmd), "ip addr add %s dev lo", spec->ip_address);
system(cmd);
}
}
// Drop capabilities
if (drop_capabilities(spec->capabilities, spec->num_capabilities) == -1) {
return 1;
}
// Execute command
if (spec->env) {
execvpe(spec->command, spec->args, spec->env);
} else {
execvp(spec->command, spec->args);
}
perror("execvp");
return 1;
}
// Run container
int run_container(container_spec_t* spec) {
const int STACK_SIZE = 1024 * 1024;
char* stack = malloc(STACK_SIZE);
if (!stack) {
perror("malloc");
return -1;
}
// Create namespaces
int flags = CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWUTS |
CLONE_NEWIPC | CLONE_NEWNS | CLONE_NEWUSER;
pid_t container_pid = clone(container_main, stack + STACK_SIZE,
flags | SIGCHLD, spec);
if (container_pid == -1) {
perror("clone");
free(stack);
return -1;
}
// Setup user namespace mappings
setup_user_namespace_mappings(container_pid);
printf("Container %s started with PID %d\n", spec->name, container_pid);
// Wait for container
int status;
waitpid(container_pid, &status, 0);
free(stack);
return WEXITSTATUS(status);
}
Container Image Management
// container_image.c - Container image handling
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <archive.h>
#include <archive_entry.h>
typedef struct {
char name[256];
char tag[64];
char digest[128];
size_t size;
time_t created;
char* config_json;
} container_image_t;
// Extract container image (tar format)
int extract_container_image(const char* image_path, const char* extract_path) {
struct archive* a;
struct archive* ext;
struct archive_entry* entry;
int r;
a = archive_read_new();
archive_read_support_filter_gzip(a);
archive_read_support_format_tar(a);
ext = archive_write_disk_new();
archive_write_disk_set_options(ext, ARCHIVE_EXTRACT_TIME | ARCHIVE_EXTRACT_PERM);
if ((r = archive_read_open_filename(a, image_path, 10240))) {
fprintf(stderr, "Error opening image: %s\n", archive_error_string(a));
return -1;
}
// Change to extraction directory
if (chdir(extract_path) != 0) {
perror("chdir");
return -1;
}
while (archive_read_next_header(a, &entry) == ARCHIVE_OK) {
const char* current_file = archive_entry_pathname(entry);
printf("Extracting: %s\n", current_file);
// Set full path
char full_path[512];
snprintf(full_path, sizeof(full_path), "%s/%s", extract_path, current_file);
archive_entry_set_pathname(entry, full_path);
r = archive_write_header(ext, entry);
if (r < ARCHIVE_OK) {
fprintf(stderr, "Warning: %s\n", archive_error_string(ext));
}
if (archive_entry_size(entry) > 0) {
copy_data(a, ext);
}
r = archive_write_finish_entry(ext);
if (r < ARCHIVE_OK) {
fprintf(stderr, "Warning: %s\n", archive_error_string(ext));
}
}
archive_read_close(a);
archive_read_free(a);
archive_write_close(ext);
archive_write_free(ext);
return 0;
}
// Copy data between archives
static int copy_data(struct archive* ar, struct archive* aw) {
int r;
const void* buff;
size_t size;
la_int64_t offset;
for (;;) {
r = archive_read_data_block(ar, &buff, &size, &offset);
if (r == ARCHIVE_EOF)
return ARCHIVE_OK;
if (r < ARCHIVE_OK)
return r;
r = archive_write_data_block(aw, buff, size, offset);
if (r < ARCHIVE_OK) {
fprintf(stderr, "Error: %s\n", archive_error_string(aw));
return r;
}
}
}
// Create container filesystem layers
int create_overlay_filesystem(const char* lower_dir, const char* upper_dir,
const char* work_dir, const char* merged_dir) {
char mount_options[1024];
snprintf(mount_options, sizeof(mount_options),
"lowerdir=%s,upperdir=%s,workdir=%s",
lower_dir, upper_dir, work_dir);
// Create directories
mkdir(upper_dir, 0755);
mkdir(work_dir, 0755);
mkdir(merged_dir, 0755);
// Mount overlay filesystem
if (mount("overlay", merged_dir, "overlay", 0, mount_options) == -1) {
perror("mount overlay");
return -1;
}
printf("Overlay filesystem mounted at %s\n", merged_dir);
return 0;
}
// Container registry interaction
typedef struct {
char registry[256];
char username[128];
char password[128];
char auth_token[512];
} registry_config_t;
// Simple HTTP client for registry operations
int download_image_manifest(registry_config_t* config,
const char* image_name, const char* tag,
char* manifest_buffer, size_t buffer_size) {
// This would implement actual HTTP client
// For demonstration, we'll use curl system call
char cmd[1024];
snprintf(cmd, sizeof(cmd),
"curl -s -H 'Accept: application/vnd.docker.distribution.manifest.v2+json' "
"-H 'Authorization: Bearer %s' "
"https://%s/v2/%s/manifests/%s",
config->auth_token, config->registry, image_name, tag);
FILE* fp = popen(cmd, "r");
if (!fp) {
perror("popen");
return -1;
}
size_t read = fread(manifest_buffer, 1, buffer_size - 1, fp);
manifest_buffer[read] = '\0';
int status = pclose(fp);
return WEXITSTATUS(status);
}
Advanced Container Features
Container Networking
// container_networking.c - Advanced container networking
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <sys/socket.h>
#include <net/if.h>
typedef struct {
char name[32];
char ip_address[64];
char gateway[64];
int mtu;
} veth_config_t;
// Create veth pair
int create_veth_pair(const char* veth1, const char* veth2) {
// This would use netlink to create veth pair
char cmd[256];
snprintf(cmd, sizeof(cmd), "ip link add %s type veth peer name %s", veth1, veth2);
if (system(cmd) != 0) {
fprintf(stderr, "Failed to create veth pair\n");
return -1;
}
return 0;
}
// Move interface to namespace
int move_interface_to_namespace(const char* interface, pid_t target_pid) {
char cmd[256];
snprintf(cmd, sizeof(cmd), "ip link set %s netns %d", interface, target_pid);
if (system(cmd) != 0) {
fprintf(stderr, "Failed to move interface to namespace\n");
return -1;
}
return 0;
}
// Setup container bridge networking
int setup_bridge_networking(const char* bridge_name,
const char* container_veth,
const char* host_veth,
veth_config_t* config) {
char cmd[512];
// Create bridge if it doesn't exist
snprintf(cmd, sizeof(cmd), "ip link add %s type bridge 2>/dev/null || true", bridge_name);
system(cmd);
// Bring up bridge
snprintf(cmd, sizeof(cmd), "ip link set %s up", bridge_name);
system(cmd);
// Create veth pair
create_veth_pair(container_veth, host_veth);
// Add host veth to bridge
snprintf(cmd, sizeof(cmd), "ip link set %s master %s", host_veth, bridge_name);
system(cmd);
// Bring up host veth
snprintf(cmd, sizeof(cmd), "ip link set %s up", host_veth);
system(cmd);
return 0;
}
// Configure container network interface
int configure_container_interface(veth_config_t* config) {
char cmd[256];
// Bring up interface
snprintf(cmd, sizeof(cmd), "ip link set %s up", config->name);
system(cmd);
// Set IP address
snprintf(cmd, sizeof(cmd), "ip addr add %s dev %s", config->ip_address, config->name);
system(cmd);
// Set MTU
if (config->mtu > 0) {
snprintf(cmd, sizeof(cmd), "ip link set %s mtu %d", config->name, config->mtu);
system(cmd);
}
// Set default route
if (strlen(config->gateway) > 0) {
snprintf(cmd, sizeof(cmd), "ip route add default via %s", config->gateway);
system(cmd);
}
return 0;
}
Container Security
// container_security.c - Advanced container security
#include <sys/prctl.h>
#include <linux/securebits.h>
#include <linux/seccomp.h>
#include <linux/filter.h>
// Seccomp filter for containers
struct sock_filter seccomp_filter[] = {
// Allow basic syscalls
BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
// Allow read/write/open
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_read, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_openat, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
// Deny dangerous syscalls
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_ptrace, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_reboot, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_kexec_load, 0, 1),
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
// Default allow
BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
};
// Apply seccomp filter
int apply_seccomp_filter() {
struct sock_fprog prog = {
.len = sizeof(seccomp_filter) / sizeof(seccomp_filter[0]),
.filter = seccomp_filter,
};
if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
perror("prctl NO_NEW_PRIVS");
return -1;
}
if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == -1) {
perror("prctl SECCOMP");
return -1;
}
return 0;
}
// Setup security policies
int setup_container_security() {
// Drop all capabilities except basic ones
cap_value_t keep_caps[] = {
CAP_CHOWN,
CAP_DAC_OVERRIDE,
CAP_FOWNER,
CAP_SETGID,
CAP_SETUID,
};
drop_capabilities(keep_caps, sizeof(keep_caps) / sizeof(keep_caps[0]));
// Apply seccomp filter
apply_seccomp_filter();
// Disable core dumps
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) == -1) {
perror("prctl DUMPABLE");
return -1;
}
// Set process death signal
if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) == -1) {
perror("prctl PDEATHSIG");
return -1;
}
return 0;
}
Container Orchestration Basics
Container Management
#!/bin/bash
# container_manager.sh - Simple container orchestration
CONTAINER_DIR="/tmp/containers"
BRIDGE_NAME="container0"
# Initialize container environment
init_container_env() {
mkdir -p $CONTAINER_DIR/{running,stopped,images}
# Setup bridge network
if ! ip link show $BRIDGE_NAME &>/dev/null; then
ip link add $BRIDGE_NAME type bridge
ip addr add 172.17.0.1/16 dev $BRIDGE_NAME
ip link set $BRIDGE_NAME up
# Enable IP forwarding
echo 1 > /proc/sys/net/ipv4/ip_forward
# Setup iptables rules
iptables -t nat -A POSTROUTING -s 172.17.0.0/16 ! -o $BRIDGE_NAME -j MASQUERADE
iptables -A FORWARD -i $BRIDGE_NAME -o $BRIDGE_NAME -j ACCEPT
fi
}
# Container lifecycle management
create_container() {
local name=$1
local image=$2
local command=$3
local container_id=$(uuidgen | tr -d '-' | head -c 12)
local container_path="$CONTAINER_DIR/stopped/$container_id"
mkdir -p $container_path
cat > $container_path/config.json << EOF
{
"id": "$container_id",
"name": "$name",
"image": "$image",
"command": "$command",
"created": "$(date -Iseconds)",
"state": "created"
}
EOF
echo $container_id
}
start_container() {
local container_id=$1
local container_path="$CONTAINER_DIR/stopped/$container_id"
if [ ! -d "$container_path" ]; then
echo "Container $container_id not found"
return 1
fi
# Read configuration
local config=$(cat $container_path/config.json)
local name=$(echo $config | jq -r '.name')
local image=$(echo $config | jq -r '.image')
local command=$(echo $config | jq -r '.command')
# Allocate IP address
local ip_suffix=$(($RANDOM % 254 + 2))
local container_ip="172.17.0.$ip_suffix"
# Create container namespace and run
local pid=$(nohup unshare --pid --net --uts --ipc --mount --fork \
bash -c "
# Setup networking
hostname $name
# Setup veth pair
veth_host=\"veth${container_id:0:8}h\"
veth_container=\"veth${container_id:0:8}c\"
# Create veth pair in host namespace
ip link add \$veth_host type veth peer name \$veth_container
ip link set \$veth_host master $BRIDGE_NAME
ip link set \$veth_host up
# Move container veth to container namespace
ip link set \$veth_container netns \$\$
# Configure container interface
ip addr add $container_ip/16 dev \$veth_container
ip link set \$veth_container up
ip route add default via 172.17.0.1
# Execute command
exec $command
" > $container_path/stdout.log 2> $container_path/stderr.log & echo $!)
# Move to running directory
mv $container_path $CONTAINER_DIR/running/$container_id
# Update container state
jq '.state = "running" | .pid = '$pid' | .ip = "'$container_ip'"' \
$CONTAINER_DIR/running/$container_id/config.json > /tmp/config.tmp
mv /tmp/config.tmp $CONTAINER_DIR/running/$container_id/config.json
echo "Container $container_id started (PID: $pid, IP: $container_ip)"
}
stop_container() {
local container_id=$1
local container_path="$CONTAINER_DIR/running/$container_id"
if [ ! -d "$container_path" ]; then
echo "Container $container_id not running"
return 1
fi
local pid=$(jq -r '.pid' $container_path/config.json)
# Send SIGTERM then SIGKILL
kill -TERM $pid 2>/dev/null
sleep 5
kill -KILL $pid 2>/dev/null
# Clean up networking
local veth_host="veth${container_id:0:8}h"
ip link delete $veth_host 2>/dev/null
# Move to stopped directory
mv $container_path $CONTAINER_DIR/stopped/$container_id
# Update state
jq '.state = "stopped" | .stopped = "'$(date -Iseconds)'"' \
$CONTAINER_DIR/stopped/$container_id/config.json > /tmp/config.tmp
mv /tmp/config.tmp $CONTAINER_DIR/stopped/$container_id/config.json
echo "Container $container_id stopped"
}
list_containers() {
echo "CONTAINER ID NAME STATE IP ADDRESS COMMAND"
echo "============ ==== ===== ========== ======="
for state_dir in running stopped; do
for container_path in $CONTAINER_DIR/$state_dir/*; do
if [ -f "$container_path/config.json" ]; then
local config=$(cat $container_path/config.json)
local id=$(basename $container_path)
local name=$(echo $config | jq -r '.name')
local state=$(echo $config | jq -r '.state')
local ip=$(echo $config | jq -r '.ip // "N/A"')
local command=$(echo $config | jq -r '.command')
printf "%-15s %-14s %-10s %-15s %s\n" \
"${id:0:12}" "$name" "$state" "$ip" "$command"
fi
done
done
}
# Resource monitoring
monitor_containers() {
echo "=== Container Resource Usage ==="
for container_path in $CONTAINER_DIR/running/*; do
if [ -f "$container_path/config.json" ]; then
local config=$(cat $container_path/config.json)
local id=$(basename $container_path)
local name=$(echo $config | jq -r '.name')
local pid=$(echo $config | jq -r '.pid')
echo "Container: $name ($id)"
if [ -d "/proc/$pid" ]; then
# CPU usage
local cpu_usage=$(ps -p $pid -o %cpu --no-headers)
echo " CPU: $cpu_usage%"
# Memory usage
local mem_usage=$(ps -p $pid -o rss --no-headers)
echo " Memory: $((mem_usage / 1024)) MB"
# Process count
local proc_count=$(pstree -p $pid | grep -o '([0-9]*)' | wc -l)
echo " Processes: $proc_count"
else
echo " Status: Not running"
fi
echo
fi
done
}
# Main command interface
case "$1" in
init)
init_container_env
;;
create)
create_container "$2" "$3" "$4"
;;
start)
start_container "$2"
;;
stop)
stop_container "$2"
;;
list)
list_containers
;;
monitor)
monitor_containers
;;
*)
echo "Usage: $0 {init|create|start|stop|list|monitor}"
echo " create <name> <image> <command>"
echo " start <container_id>"
echo " stop <container_id>"
;;
esac
Conclusion
Container technology represents a sophisticated orchestration of Linux kernel features. Understanding namespaces, cgroups, and security mechanisms at a deep level enables building robust, secure containerized systems. From simple isolation to complex orchestration, these fundamental technologies power modern cloud infrastructure.
The techniques covered here—namespace management, resource control, security hardening, and runtime implementation—provide the foundation for understanding and extending container technology. Whether you’re building custom runtimes, debugging container issues, or implementing security policies, mastering these internals is essential for modern infrastructure development.
Container internals knowledge also enables better troubleshooting, performance optimization, and security analysis in production environments. As containers continue to evolve, understanding these core concepts ensures you can adapt to new technologies and solve complex challenges in containerized systems.