Advanced Signal Processing in Linux Systems: Real-Time Analysis and Implementation // Support Tools

Advanced signal processing in Linux systems requires deep understanding of both mathematical algorithms and system-level optimization techniques. This comprehensive guide explores real-time digital signal processing (DSP) implementation, from low-level kernel interfaces to high-performance algorithmic optimization, enabling the development of professional-grade signal processing applications.

Signal Processing Fundamentals and System Architecture

Modern signal processing applications demand both mathematical precision and computational efficiency. Linux provides multiple pathways for signal acquisition, processing, and output, each with distinct performance characteristics.

Real-Time Signal Processing Architecture

#include <alsa/asoundlib.h>
#include <pthread.h>
#include <sched.h>
#include <sys/mman.h>
#include <immintrin.h>

typedef struct {
    snd_pcm_t *capture_handle;
    snd_pcm_t *playback_handle;
    snd_pcm_hw_params_t *hw_params;
    unsigned int sample_rate;
    unsigned int channels;
    snd_pcm_format_t format;
    snd_pcm_uframes_t frames_per_period;
    snd_pcm_uframes_t buffer_size;
    
    // Processing buffers
    float *input_buffer;
    float *output_buffer;
    float *processing_buffer;
    
    // Processing chain
    void (*process_callback)(float *input, float *output, 
                           unsigned int frames, void *user_data);
    void *user_data;
    
    // Real-time control
    pthread_t processing_thread;
    volatile int running;
    int priority;
    
    // Performance metrics
    unsigned long long total_frames;
    unsigned long long xruns;
    struct timespec last_process_time;
} realtime_processor_t;

// Initialize real-time audio processing system
int initialize_realtime_processor(realtime_processor_t *processor,
                                 const char *device_name,
                                 unsigned int sample_rate,
                                 unsigned int channels,
                                 snd_pcm_uframes_t period_size) {
    int err;
    
    processor->sample_rate = sample_rate;
    processor->channels = channels;
    processor->format = SND_PCM_FORMAT_FLOAT_LE;
    processor->frames_per_period = period_size;
    processor->buffer_size = period_size * 4; // 4 periods
    
    // Open capture device
    err = snd_pcm_open(&processor->capture_handle, device_name,
                      SND_PCM_STREAM_CAPTURE, SND_PCM_NONBLOCK);
    if (err < 0) return err;
    
    // Open playback device
    err = snd_pcm_open(&processor->playback_handle, device_name,
                      SND_PCM_STREAM_PLAYBACK, SND_PCM_NONBLOCK);
    if (err < 0) return err;
    
    // Configure hardware parameters
    snd_pcm_hw_params_alloca(&processor->hw_params);
    
    // Capture configuration
    snd_pcm_hw_params_any(processor->capture_handle, processor->hw_params);
    snd_pcm_hw_params_set_access(processor->capture_handle, 
                                processor->hw_params, 
                                SND_PCM_ACCESS_RW_INTERLEAVED);
    snd_pcm_hw_params_set_format(processor->capture_handle,
                                processor->hw_params, processor->format);
    snd_pcm_hw_params_set_channels(processor->capture_handle,
                                  processor->hw_params, channels);
    snd_pcm_hw_params_set_rate_near(processor->capture_handle,
                                   processor->hw_params, &sample_rate, 0);
    snd_pcm_hw_params_set_period_size_near(processor->capture_handle,
                                          processor->hw_params,
                                          &processor->frames_per_period, 0);
    snd_pcm_hw_params_set_buffer_size_near(processor->capture_handle,
                                          processor->hw_params,
                                          &processor->buffer_size);
    
    err = snd_pcm_hw_params(processor->capture_handle, processor->hw_params);
    if (err < 0) return err;
    
    // Similar configuration for playback
    snd_pcm_hw_params_any(processor->playback_handle, processor->hw_params);
    // ... (similar setup for playback)
    
    // Allocate processing buffers
    size_t buffer_bytes = processor->frames_per_period * channels * sizeof(float);
    processor->input_buffer = aligned_alloc(32, buffer_bytes);
    processor->output_buffer = aligned_alloc(32, buffer_bytes);
    processor->processing_buffer = aligned_alloc(32, buffer_bytes);
    
    if (!processor->input_buffer || !processor->output_buffer || 
        !processor->processing_buffer) {
        return -ENOMEM;
    }
    
    // Lock memory to prevent paging
    mlock(processor->input_buffer, buffer_bytes);
    mlock(processor->output_buffer, buffer_bytes);
    mlock(processor->processing_buffer, buffer_bytes);
    
    return 0;
}

// Real-time processing thread
void* realtime_processing_thread(void *arg) {
    realtime_processor_t *processor = (realtime_processor_t *)arg;
    
    // Set real-time scheduling
    struct sched_param param;
    param.sched_priority = processor->priority;
    if (sched_setscheduler(0, SCHED_FIFO, &param) < 0) {
        perror("Failed to set real-time scheduling");
    }
    
    // Start audio streams
    snd_pcm_prepare(processor->capture_handle);
    snd_pcm_prepare(processor->playback_handle);
    snd_pcm_start(processor->capture_handle);
    snd_pcm_start(processor->playback_handle);
    
    while (processor->running) {
        struct timespec start_time;
        clock_gettime(CLOCK_MONOTONIC, &start_time);
        
        // Read audio input
        snd_pcm_sframes_t frames_read = snd_pcm_readi(
            processor->capture_handle,
            processor->input_buffer,
            processor->frames_per_period);
        
        if (frames_read < 0) {
            if (frames_read == -EPIPE) {
                // Handle xrun
                processor->xruns++;
                snd_pcm_prepare(processor->capture_handle);
                snd_pcm_start(processor->capture_handle);
                continue;
            }
        }
        
        // Process audio
        if (processor->process_callback && frames_read > 0) {
            processor->process_callback(processor->input_buffer,
                                      processor->output_buffer,
                                      frames_read,
                                      processor->user_data);
        }
        
        // Write audio output
        snd_pcm_sframes_t frames_written = snd_pcm_writei(
            processor->playback_handle,
            processor->output_buffer,
            frames_read);
        
        if (frames_written < 0) {
            if (frames_written == -EPIPE) {
                processor->xruns++;
                snd_pcm_prepare(processor->playback_handle);
                continue;
            }
        }
        
        processor->total_frames += frames_read;
        processor->last_process_time = start_time;
    }
    
    return NULL;
}

Advanced Digital Filter Implementation

Digital filters are fundamental components of signal processing systems, requiring efficient implementation for real-time operation.

High-Performance FIR Filter Implementation

// SIMD-optimized FIR filter
typedef struct {
    float *coefficients;
    float *delay_line;
    int num_taps;
    int delay_index;
    int block_size;
} fir_filter_t;

fir_filter_t* create_fir_filter(const float *coeffs, int num_taps) {
    fir_filter_t *filter = malloc(sizeof(fir_filter_t));
    
    // Align coefficient array for SIMD
    filter->num_taps = num_taps;
    filter->coefficients = aligned_alloc(32, num_taps * sizeof(float));
    filter->delay_line = aligned_alloc(32, num_taps * sizeof(float));
    
    memcpy(filter->coefficients, coeffs, num_taps * sizeof(float));
    memset(filter->delay_line, 0, num_taps * sizeof(float));
    
    filter->delay_index = 0;
    filter->block_size = 8; // Process 8 samples at once for AVX
    
    return filter;
}

// AVX-optimized FIR filter processing
void fir_filter_process_avx(fir_filter_t *filter, 
                           const float *input, 
                           float *output, 
                           int num_samples) {
    const int num_taps = filter->num_taps;
    const int block_size = filter->block_size;
    
    for (int i = 0; i < num_samples; i += block_size) {
        int samples_to_process = (i + block_size <= num_samples) ? 
                                block_size : num_samples - i;
        
        __m256 acc = _mm256_setzero_ps();
        
        // Process in blocks of 8 coefficients
        for (int j = 0; j < num_taps; j += 8) {
            int coeffs_to_process = (j + 8 <= num_taps) ? 8 : num_taps - j;
            
            // Load coefficients
            __m256 coeffs = _mm256_load_ps(&filter->coefficients[j]);
            
            // Prepare delay line data
            __m256 delay_data = _mm256_setzero_ps();
            
            for (int k = 0; k < coeffs_to_process; k++) {
                int delay_idx = (filter->delay_index - j - k + num_taps) % num_taps;
                ((float*)&delay_data)[k] = filter->delay_line[delay_idx];
            }
            
            // Multiply and accumulate
            __m256 product = _mm256_mul_ps(coeffs, delay_data);
            acc = _mm256_add_ps(acc, product);
        }
        
        // Horizontal sum of accumulated values
        __m128 sum_high = _mm256_extractf128_ps(acc, 1);
        __m128 sum_low = _mm256_extractf128_ps(acc, 0);
        __m128 sum = _mm_add_ps(sum_high, sum_low);
        
        sum = _mm_hadd_ps(sum, sum);
        sum = _mm_hadd_ps(sum, sum);
        
        output[i] = _mm_cvtss_f32(sum);
        
        // Update delay line
        filter->delay_line[filter->delay_index] = input[i];
        filter->delay_index = (filter->delay_index + 1) % num_taps;
    }
}

// Cascade biquad IIR filter for complex responses
typedef struct {
    float b0, b1, b2; // Numerator coefficients
    float a1, a2;     // Denominator coefficients (a0 = 1)
    float x1, x2;     // Input delay elements
    float y1, y2;     // Output delay elements
} biquad_section_t;

typedef struct {
    biquad_section_t *sections;
    int num_sections;
    float overall_gain;
} cascade_iir_filter_t;

// Process samples through cascaded biquad sections
void cascade_iir_process(cascade_iir_filter_t *filter,
                        const float *input,
                        float *output,
                        int num_samples) {
    for (int i = 0; i < num_samples; i++) {
        float sample = input[i] * filter->overall_gain;
        
        // Process through each biquad section
        for (int j = 0; j < filter->num_sections; j++) {
            biquad_section_t *section = &filter->sections[j];
            
            // Direct Form II implementation
            float w = sample - section->a1 * section->y1 - section->a2 * section->y2;
            sample = section->b0 * w + section->b1 * section->x1 + section->b2 * section->x2;
            
            // Update delay elements
            section->x2 = section->x1;
            section->x1 = w;
            section->y2 = section->y1;
            section->y1 = sample;
        }
        
        output[i] = sample;
    }
}

// Adaptive filter implementation (LMS algorithm)
typedef struct {
    float *weights;
    float *delay_line;
    int num_taps;
    int delay_index;
    float step_size;
    float *error_history;
    int error_history_size;
    int error_index;
} adaptive_filter_t;

float adaptive_filter_process(adaptive_filter_t *filter,
                             float input_sample,
                             float desired_output) {
    // Update delay line
    filter->delay_line[filter->delay_index] = input_sample;
    
    // Calculate filter output
    float output = 0.0f;
    for (int i = 0; i < filter->num_taps; i++) {
        int idx = (filter->delay_index - i + filter->num_taps) % filter->num_taps;
        output += filter->weights[i] * filter->delay_line[idx];
    }
    
    // Calculate error
    float error = desired_output - output;
    
    // Update weights using LMS algorithm
    for (int i = 0; i < filter->num_taps; i++) {
        int idx = (filter->delay_index - i + filter->num_taps) % filter->num_taps;
        filter->weights[i] += filter->step_size * error * filter->delay_line[idx];
    }
    
    // Store error for convergence analysis
    filter->error_history[filter->error_index] = error * error;
    filter->error_index = (filter->error_index + 1) % filter->error_history_size;
    
    // Update delay index
    filter->delay_index = (filter->delay_index + 1) % filter->num_taps;
    
    return output;
}

Fast Fourier Transform Optimization

FFT is central to many signal processing applications, requiring highly optimized implementations for real-time performance.

Cache-Optimized Radix-4 FFT

#include <complex.h>
#include <math.h>

typedef float complex cfloat;

// Twiddle factor calculation and caching
typedef struct {
    cfloat *twiddle_factors;
    int *bit_reversal_table;
    int size;
    int log2_size;
} fft_context_t;

fft_context_t* create_fft_context(int size) {
    if ((size & (size - 1)) != 0) {
        return NULL; // Size must be power of 2
    }
    
    fft_context_t *ctx = malloc(sizeof(fft_context_t));
    ctx->size = size;
    ctx->log2_size = 0;
    int temp = size;
    while (temp > 1) {
        temp >>= 1;
        ctx->log2_size++;
    }
    
    // Precompute twiddle factors
    ctx->twiddle_factors = aligned_alloc(32, size * sizeof(cfloat));
    for (int i = 0; i < size; i++) {
        float angle = -2.0f * M_PI * i / size;
        ctx->twiddle_factors[i] = cosf(angle) + I * sinf(angle);
    }
    
    // Precompute bit-reversal table
    ctx->bit_reversal_table = malloc(size * sizeof(int));
    for (int i = 0; i < size; i++) {
        int reversed = 0;
        int temp = i;
        for (int bit = 0; bit < ctx->log2_size; bit++) {
            reversed = (reversed << 1) | (temp & 1);
            temp >>= 1;
        }
        ctx->bit_reversal_table[i] = reversed;
    }
    
    return ctx;
}

// Optimized radix-4 FFT implementation
void fft_radix4_inplace(fft_context_t *ctx, cfloat *data) {
    const int size = ctx->size;
    const int log4_size = ctx->log2_size / 2;
    
    // Bit-reverse permutation
    for (int i = 0; i < size; i++) {
        int j = ctx->bit_reversal_table[i];
        if (i < j) {
            cfloat temp = data[i];
            data[i] = data[j];
            data[j] = temp;
        }
    }
    
    // Radix-4 computation
    for (int stage = 0; stage < log4_size; stage++) {
        int group_size = 1 << (2 * stage);
        int num_groups = size / (4 * group_size);
        int twiddle_step = size / (4 * group_size);
        
        for (int group = 0; group < num_groups; group++) {
            cfloat w1 = ctx->twiddle_factors[group * twiddle_step];
            cfloat w2 = ctx->twiddle_factors[2 * group * twiddle_step];
            cfloat w3 = ctx->twiddle_factors[3 * group * twiddle_step];
            
            for (int i = 0; i < group_size; i++) {
                int base = group * 4 * group_size + i;
                
                cfloat x0 = data[base];
                cfloat x1 = data[base + group_size] * w1;
                cfloat x2 = data[base + 2 * group_size] * w2;
                cfloat x3 = data[base + 3 * group_size] * w3;
                
                // Radix-4 butterfly
                cfloat temp1 = x0 + x2;
                cfloat temp2 = x0 - x2;
                cfloat temp3 = x1 + x3;
                cfloat temp4 = (x1 - x3) * I;
                
                data[base] = temp1 + temp3;
                data[base + group_size] = temp2 + temp4;
                data[base + 2 * group_size] = temp1 - temp3;
                data[base + 3 * group_size] = temp2 - temp4;
            }
        }
    }
}

// SIMD-optimized FFT for real-valued signals
void fft_real_optimized(fft_context_t *ctx, const float *real_input, 
                       cfloat *complex_output) {
    const int size = ctx->size;
    const int half_size = size / 2;
    
    // Pack real data into complex format
    cfloat *packed_data = aligned_alloc(32, half_size * sizeof(cfloat));
    
    for (int i = 0; i < half_size; i++) {
        packed_data[i] = real_input[2*i] + I * real_input[2*i + 1];
    }
    
    // Perform half-size complex FFT
    fft_radix4_inplace(ctx, packed_data);
    
    // Unpack and reconstruct full spectrum
    complex_output[0] = crealf(packed_data[0]) + cimagf(packed_data[0]);
    complex_output[half_size] = crealf(packed_data[0]) - cimagf(packed_data[0]);
    
    for (int k = 1; k < half_size; k++) {
        cfloat Fk = packed_data[k];
        cfloat Fnk = conjf(packed_data[half_size - k]);
        
        cfloat Hk = 0.5f * (Fk + Fnk);
        cfloat Gk = -0.5f * I * (Fk - Fnk);
        
        cfloat wk = ctx->twiddle_factors[k];
        
        complex_output[k] = Hk + Gk * wk;
        complex_output[size - k] = conjf(Hk - Gk * wk);
    }
    
    free(packed_data);
}

// Overlap-add convolution using FFT
void fft_convolution(fft_context_t *fft_ctx, 
                    const float *signal, int signal_length,
                    const float *kernel, int kernel_length,
                    float *output) {
    const int fft_size = fft_ctx->size;
    const int overlap_size = kernel_length - 1;
    const int hop_size = fft_size - overlap_size;
    
    // Prepare kernel FFT
    cfloat *kernel_fft = aligned_alloc(32, fft_size * sizeof(cfloat));
    for (int i = 0; i < kernel_length; i++) {
        kernel_fft[i] = kernel[i];
    }
    for (int i = kernel_length; i < fft_size; i++) {
        kernel_fft[i] = 0.0f;
    }
    fft_radix4_inplace(fft_ctx, kernel_fft);
    
    // Overlap buffer
    float *overlap_buffer = calloc(overlap_size, sizeof(float));
    
    cfloat *signal_fft = aligned_alloc(32, fft_size * sizeof(cfloat));
    cfloat *result_fft = aligned_alloc(32, fft_size * sizeof(cfloat));
    
    int output_pos = 0;
    
    for (int pos = 0; pos < signal_length; pos += hop_size) {
        // Prepare signal block
        for (int i = 0; i < fft_size; i++) {
            if (pos + i < signal_length) {
                signal_fft[i] = signal[pos + i];
            } else {
                signal_fft[i] = 0.0f;
            }
        }
        
        // Forward FFT
        fft_radix4_inplace(fft_ctx, signal_fft);
        
        // Frequency domain multiplication
        for (int i = 0; i < fft_size; i++) {
            result_fft[i] = signal_fft[i] * kernel_fft[i];
        }
        
        // Inverse FFT
        for (int i = 0; i < fft_size; i++) {
            result_fft[i] = conjf(result_fft[i]);
        }
        fft_radix4_inplace(fft_ctx, result_fft);
        for (int i = 0; i < fft_size; i++) {
            result_fft[i] = conjf(result_fft[i]) / fft_size;
        }
        
        // Overlap-add
        for (int i = 0; i < overlap_size && output_pos + i < signal_length; i++) {
            output[output_pos + i] = crealf(result_fft[i]) + overlap_buffer[i];
        }
        
        // Save overlap for next iteration
        for (int i = 0; i < overlap_size; i++) {
            overlap_buffer[i] = crealf(result_fft[hop_size + i]);
        }
        
        output_pos += hop_size;
    }
    
    free(overlap_buffer);
    free(signal_fft);
    free(result_fft);
    free(kernel_fft);
}

Advanced Spectral Analysis

Sophisticated spectral analysis techniques enable deep signal understanding and feature extraction.

Multi-Resolution Spectral Analysis

// Continuous Wavelet Transform implementation
typedef struct {
    float *scales;
    int num_scales;
    cfloat **wavelets;
    int wavelet_length;
    char wavelet_type[32];
} cwt_context_t;

// Morlet wavelet generation
void generate_morlet_wavelet(cfloat *wavelet, int length, 
                            float scale, float center_freq) {
    const float sigma = scale / (2.0f * M_PI * center_freq);
    const int center = length / 2;
    
    for (int i = 0; i < length; i++) {
        float t = (i - center) / (float)length;
        float gaussian = expf(-0.5f * (t * t) / (sigma * sigma));
        float oscillation = cosf(2.0f * M_PI * center_freq * t) + 
                           I * sinf(2.0f * M_PI * center_freq * t);
        
        wavelet[i] = gaussian * oscillation / sqrtf(scale);
    }
}

cwt_context_t* create_cwt_context(const float *scales, int num_scales,
                                 int signal_length, const char *wavelet_type) {
    cwt_context_t *ctx = malloc(sizeof(cwt_context_t));
    
    ctx->num_scales = num_scales;
    ctx->scales = malloc(num_scales * sizeof(float));
    memcpy(ctx->scales, scales, num_scales * sizeof(float));
    strcpy(ctx->wavelet_type, wavelet_type);
    
    // Determine wavelet length based on largest scale
    float max_scale = scales[0];
    for (int i = 1; i < num_scales; i++) {
        if (scales[i] > max_scale) max_scale = scales[i];
    }
    ctx->wavelet_length = (int)(8 * max_scale);
    if (ctx->wavelet_length > signal_length) {
        ctx->wavelet_length = signal_length;
    }
    
    // Pregenerate wavelets for all scales
    ctx->wavelets = malloc(num_scales * sizeof(cfloat*));
    for (int i = 0; i < num_scales; i++) {
        ctx->wavelets[i] = aligned_alloc(32, ctx->wavelet_length * sizeof(cfloat));
        
        if (strcmp(wavelet_type, "morlet") == 0) {
            generate_morlet_wavelet(ctx->wavelets[i], ctx->wavelet_length, 
                                   scales[i], 1.0f);
        }
    }
    
    return ctx;
}

// Compute CWT using convolution
void compute_cwt(cwt_context_t *ctx, const float *signal, int signal_length,
                cfloat **coefficients) {
    for (int scale_idx = 0; scale_idx < ctx->num_scales; scale_idx++) {
        cfloat *wavelet = ctx->wavelets[scale_idx];
        cfloat *scale_coeffs = coefficients[scale_idx];
        
        // Convolution with wavelet
        for (int n = 0; n < signal_length; n++) {
            cfloat sum = 0.0f;
            
            for (int k = 0; k < ctx->wavelet_length; k++) {
                int signal_idx = n - k + ctx->wavelet_length / 2;
                if (signal_idx >= 0 && signal_idx < signal_length) {
                    sum += signal[signal_idx] * conjf(wavelet[k]);
                }
            }
            
            scale_coeffs[n] = sum;
        }
    }
}

// Short-Time Fourier Transform with advanced windowing
typedef struct {
    fft_context_t *fft_ctx;
    float *window;
    int window_length;
    int hop_size;
    int fft_size;
    char window_type[32];
    float *magnitude_spectrum;
    float *phase_spectrum;
} stft_context_t;

// Generate various window functions
void generate_window(float *window, int length, const char *type) {
    if (strcmp(type, "hann") == 0) {
        for (int i = 0; i < length; i++) {
            window[i] = 0.5f * (1.0f - cosf(2.0f * M_PI * i / (length - 1)));
        }
    } else if (strcmp(type, "hamming") == 0) {
        for (int i = 0; i < length; i++) {
            window[i] = 0.54f - 0.46f * cosf(2.0f * M_PI * i / (length - 1));
        }
    } else if (strcmp(type, "blackman") == 0) {
        for (int i = 0; i < length; i++) {
            float n = i / (float)(length - 1);
            window[i] = 0.42f - 0.5f * cosf(2.0f * M_PI * n) + 
                       0.08f * cosf(4.0f * M_PI * n);
        }
    } else { // Rectangular
        for (int i = 0; i < length; i++) {
            window[i] = 1.0f;
        }
    }
}

stft_context_t* create_stft_context(int window_length, int hop_size,
                                   const char *window_type) {
    stft_context_t *ctx = malloc(sizeof(stft_context_t));
    
    ctx->window_length = window_length;
    ctx->hop_size = hop_size;
    ctx->fft_size = window_length;
    strcpy(ctx->window_type, window_type);
    
    // Create FFT context
    ctx->fft_ctx = create_fft_context(ctx->fft_size);
    
    // Generate window
    ctx->window = malloc(window_length * sizeof(float));
    generate_window(ctx->window, window_length, window_type);
    
    // Allocate spectrum arrays
    ctx->magnitude_spectrum = malloc((ctx->fft_size / 2 + 1) * sizeof(float));
    ctx->phase_spectrum = malloc((ctx->fft_size / 2 + 1) * sizeof(float));
    
    return ctx;
}

// Compute STFT frame
void compute_stft_frame(stft_context_t *ctx, const float *signal, 
                       int signal_length, int frame_start,
                       float *magnitude, float *phase) {
    cfloat *windowed_signal = aligned_alloc(32, ctx->fft_size * sizeof(cfloat));
    
    // Apply window and prepare for FFT
    for (int i = 0; i < ctx->window_length; i++) {
        int signal_idx = frame_start + i;
        if (signal_idx >= 0 && signal_idx < signal_length) {
            windowed_signal[i] = signal[signal_idx] * ctx->window[i];
        } else {
            windowed_signal[i] = 0.0f;
        }
    }
    
    // Zero-pad if necessary
    for (int i = ctx->window_length; i < ctx->fft_size; i++) {
        windowed_signal[i] = 0.0f;
    }
    
    // Compute FFT
    fft_radix4_inplace(ctx->fft_ctx, windowed_signal);
    
    // Extract magnitude and phase
    for (int i = 0; i <= ctx->fft_size / 2; i++) {
        cfloat bin = windowed_signal[i];
        magnitude[i] = cabsf(bin);
        phase[i] = cargf(bin);
    }
    
    free(windowed_signal);
}

Real-Time Performance Optimization

Critical real-time applications require sophisticated optimization techniques to meet strict timing requirements.

Lock-Free Ring Buffer Implementation

#include <stdatomic.h>

typedef struct {
    float *buffer;
    atomic_size_t write_index;
    atomic_size_t read_index;
    size_t size;
    size_t mask; // size - 1, for power-of-2 sizes
} lockfree_ringbuffer_t;

lockfree_ringbuffer_t* create_lockfree_ringbuffer(size_t size) {
    // Ensure size is power of 2
    if ((size & (size - 1)) != 0) {
        return NULL;
    }
    
    lockfree_ringbuffer_t *rb = malloc(sizeof(lockfree_ringbuffer_t));
    rb->buffer = aligned_alloc(64, size * sizeof(float)); // Cache-line aligned
    rb->size = size;
    rb->mask = size - 1;
    
    atomic_store(&rb->write_index, 0);
    atomic_store(&rb->read_index, 0);
    
    return rb;
}

// Lock-free write operation
int ringbuffer_write(lockfree_ringbuffer_t *rb, const float *data, 
                    size_t samples) {
    size_t write_idx = atomic_load(&rb->write_index);
    size_t read_idx = atomic_load(&rb->read_index);
    
    // Check available space
    size_t available = rb->size - (write_idx - read_idx);
    if (samples > available) {
        return -1; // Buffer full
    }
    
    // Write data in chunks to handle wrap-around
    for (size_t i = 0; i < samples; i++) {
        rb->buffer[(write_idx + i) & rb->mask] = data[i];
    }
    
    // Update write index atomically
    atomic_store(&rb->write_index, write_idx + samples);
    
    return samples;
}

// Lock-free read operation
int ringbuffer_read(lockfree_ringbuffer_t *rb, float *data, size_t samples) {
    size_t read_idx = atomic_load(&rb->read_index);
    size_t write_idx = atomic_load(&rb->write_index);
    
    // Check available data
    size_t available = write_idx - read_idx;
    if (samples > available) {
        samples = available; // Read what's available
    }
    
    // Read data in chunks to handle wrap-around
    for (size_t i = 0; i < samples; i++) {
        data[i] = rb->buffer[(read_idx + i) & rb->mask];
    }
    
    // Update read index atomically
    atomic_store(&rb->read_index, read_idx + samples);
    
    return samples;
}

// CPU affinity and NUMA optimization
void optimize_cpu_affinity(pthread_t thread, int cpu_core) {
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(cpu_core, &cpuset);
    
    int result = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset);
    if (result != 0) {
        perror("Failed to set CPU affinity");
    }
}

// Memory prefetching for signal processing
void prefetch_signal_data(const float *signal, int length, int stride) {
    for (int i = 0; i < length; i += stride) {
        __builtin_prefetch(&signal[i], 0, 3); // Prefetch for read, high locality
    }
}

// Cache-optimized signal processing loop
void process_signal_blocks(const float *input, float *output, 
                          int total_samples, int block_size,
                          void (*process_func)(const float*, float*, int)) {
    const int cache_line_samples = 64 / sizeof(float); // 16 samples per cache line
    
    for (int pos = 0; pos < total_samples; pos += block_size) {
        int current_block = (pos + block_size <= total_samples) ? 
                           block_size : total_samples - pos;
        
        // Prefetch next block
        if (pos + block_size < total_samples) {
            prefetch_signal_data(&input[pos + block_size], 
                                current_block, cache_line_samples);
        }
        
        // Process current block
        process_func(&input[pos], &output[pos], current_block);
    }
}

Machine Learning Integration

Modern signal processing increasingly leverages machine learning for adaptive processing and pattern recognition.

Real-Time Neural Network Inference

// Lightweight neural network for real-time signal classification
typedef struct {
    float **weights;
    float **biases;
    int *layer_sizes;
    int num_layers;
    float **activations;
    char activation_type[16];
} neural_network_t;

neural_network_t* create_neural_network(const int *layer_sizes, 
                                       int num_layers,
                                       const char *activation_type) {
    neural_network_t *nn = malloc(sizeof(neural_network_t));
    
    nn->num_layers = num_layers;
    nn->layer_sizes = malloc(num_layers * sizeof(int));
    memcpy(nn->layer_sizes, layer_sizes, num_layers * sizeof(int));
    strcpy(nn->activation_type, activation_type);
    
    // Allocate weights and biases
    nn->weights = malloc((num_layers - 1) * sizeof(float*));
    nn->biases = malloc((num_layers - 1) * sizeof(float*));
    nn->activations = malloc(num_layers * sizeof(float*));
    
    for (int i = 0; i < num_layers - 1; i++) {
        int input_size = layer_sizes[i];
        int output_size = layer_sizes[i + 1];
        
        nn->weights[i] = aligned_alloc(32, input_size * output_size * sizeof(float));
        nn->biases[i] = aligned_alloc(32, output_size * sizeof(float));
        
        // Xavier initialization
        float scale = sqrtf(6.0f / (input_size + output_size));
        for (int j = 0; j < input_size * output_size; j++) {
            nn->weights[i][j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * scale;
        }
        
        memset(nn->biases[i], 0, output_size * sizeof(float));
    }
    
    // Allocate activation arrays
    for (int i = 0; i < num_layers; i++) {
        nn->activations[i] = aligned_alloc(32, layer_sizes[i] * sizeof(float));
    }
    
    return nn;
}

// SIMD-optimized forward pass
void neural_network_forward(neural_network_t *nn, const float *input) {
    // Copy input to first activation layer
    memcpy(nn->activations[0], input, nn->layer_sizes[0] * sizeof(float));
    
    // Forward propagation through layers
    for (int layer = 0; layer < nn->num_layers - 1; layer++) {
        int input_size = nn->layer_sizes[layer];
        int output_size = nn->layer_sizes[layer + 1];
        
        float *weights = nn->weights[layer];
        float *biases = nn->biases[layer];
        float *input_activations = nn->activations[layer];
        float *output_activations = nn->activations[layer + 1];
        
        // Matrix multiplication with SIMD
        for (int out = 0; out < output_size; out++) {
            __m256 sum = _mm256_setzero_ps();
            
            int in;
            for (in = 0; in <= input_size - 8; in += 8) {
                __m256 input_vec = _mm256_load_ps(&input_activations[in]);
                __m256 weight_vec = _mm256_load_ps(&weights[out * input_size + in]);
                sum = _mm256_fmadd_ps(input_vec, weight_vec, sum);
            }
            
            // Handle remaining elements
            float scalar_sum = 0.0f;
            for (; in < input_size; in++) {
                scalar_sum += input_activations[in] * weights[out * input_size + in];
            }
            
            // Horizontal sum of SIMD result
            __m128 sum_high = _mm256_extractf128_ps(sum, 1);
            __m128 sum_low = _mm256_extractf128_ps(sum, 0);
            __m128 sum_total = _mm_add_ps(sum_high, sum_low);
            sum_total = _mm_hadd_ps(sum_total, sum_total);
            sum_total = _mm_hadd_ps(sum_total, sum_total);
            
            float result = _mm_cvtss_f32(sum_total) + scalar_sum + biases[out];
            
            // Apply activation function
            if (strcmp(nn->activation_type, "relu") == 0) {
                output_activations[out] = fmaxf(0.0f, result);
            } else if (strcmp(nn->activation_type, "sigmoid") == 0) {
                output_activations[out] = 1.0f / (1.0f + expf(-result));
            } else if (strcmp(nn->activation_type, "tanh") == 0) {
                output_activations[out] = tanhf(result);
            } else {
                output_activations[out] = result; // Linear
            }
        }
    }
}

// Real-time signal classification
typedef struct {
    neural_network_t *classifier;
    float *feature_buffer;
    int feature_size;
    float *classification_output;
    int num_classes;
    
    // Feature extraction parameters
    stft_context_t *stft_ctx;
    float *spectral_features;
    int num_spectral_bins;
} signal_classifier_t;

int classify_signal_frame(signal_classifier_t *classifier, 
                         const float *signal_frame, int frame_size) {
    // Extract spectral features
    compute_stft_frame(classifier->stft_ctx, signal_frame, frame_size, 0,
                      classifier->spectral_features, NULL);
    
    // Prepare features for neural network
    for (int i = 0; i < classifier->num_spectral_bins; i++) {
        classifier->feature_buffer[i] = logf(classifier->spectral_features[i] + 1e-10f);
    }
    
    // Normalize features
    float mean = 0.0f, std = 0.0f;
    for (int i = 0; i < classifier->feature_size; i++) {
        mean += classifier->feature_buffer[i];
    }
    mean /= classifier->feature_size;
    
    for (int i = 0; i < classifier->feature_size; i++) {
        float diff = classifier->feature_buffer[i] - mean;
        std += diff * diff;
    }
    std = sqrtf(std / classifier->feature_size);
    
    for (int i = 0; i < classifier->feature_size; i++) {
        classifier->feature_buffer[i] = (classifier->feature_buffer[i] - mean) / std;
    }
    
    // Run neural network inference
    neural_network_forward(classifier->classifier, classifier->feature_buffer);
    
    // Find class with highest probability
    int predicted_class = 0;
    float max_prob = classifier->classifier->activations[classifier->classifier->num_layers - 1][0];
    
    for (int i = 1; i < classifier->num_classes; i++) {
        float prob = classifier->classifier->activations[classifier->classifier->num_layers - 1][i];
        if (prob > max_prob) {
            max_prob = prob;
            predicted_class = i;
        }
    }
    
    return predicted_class;
}

Conclusion

Advanced signal processing in Linux systems requires a comprehensive understanding of mathematical algorithms, system-level optimization, and real-time programming techniques. The implementation strategies presented in this guide provide the foundation for building high-performance signal processing applications that can meet the demanding requirements of modern enterprise environments.

Key aspects of successful signal processing implementation include efficient memory management, SIMD optimization, lock-free data structures, and intelligent use of system resources. By combining these techniques with robust algorithm implementations and machine learning integration, developers can create sophisticated signal processing systems capable of handling complex real-world scenarios with minimal latency and maximum reliability.

The examples and patterns demonstrated here serve as building blocks for more complex applications, enabling the development of everything from real-time audio processing systems to advanced sensor data analysis platforms that can operate effectively in production environments.