The Cloud Controller Manager enables Kubernetes to interact with cloud provider APIs, managing resources like load balancers and storage. This deep dive explores its architecture, implementation, and cloud provider integration.

Architecture Overview

Component Architecture

Kubernetes -> Cloud Controller Manager -> Cloud Provider API
                                     -> Node Controller
                                     -> Route Controller
                                     -> Service Controller

Key Components

  1. Node Controller

    • Node Lifecycle
    • Node Labeling
    • Node Address Management
  2. Route Controller

    • Network Routes
    • Pod CIDR Assignment
    • Cloud Network Integration
  3. Service Controller

    • Load Balancer Provisioning
    • Service Endpoint Management
    • External IP Management

Cloud Provider Integration

1. Cloud Provider Interface

// Cloud provider interface
type Interface interface {
    Initialize(clientBuilder controller.ControllerClientBuilder, stop <-chan struct{})
    LoadBalancer() (LoadBalancer, bool)
    Instances() (Instances, bool)
    Zones() (Zones, bool)
    Clusters() (Clusters, bool)
    Routes() (Routes, bool)
    ProviderName() string
    HasClusterID() bool
}

// Load balancer interface
type LoadBalancer interface {
    GetLoadBalancer(ctx context.Context, clusterName string, service *v1.Service) (*LoadBalancerStatus, bool, error)
    EnsureLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*LoadBalancerStatus, error)
    UpdateLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) error
    EnsureLoadBalancerDeleted(ctx context.Context, clusterName string, service *v1.Service) error
}

2. Provider Configuration

apiVersion: v1
kind: ConfigMap
metadata:
  name: cloud-config
  namespace: kube-system
data:
  cloud.conf: |
    [Global]
    zone = "us-east-1a"
    vpc-id = "vpc-123456"
    subnet-id = "subnet-123456"
    
    [LoadBalancer]
    use-instance-security-groups = true
    security-group-ids = "sg-123456"

Controller Implementation

1. Node Controller

// Node controller operations
type NodeController struct {
    cloud cloudprovider.Interface
    kubeClient clientset.Interface
    nodeInformer coreinformers.NodeInformer
    
    // Node monitor period
    nodeMonitorPeriod time.Duration
    
    // Node initialization timeout
    nodeInitializationTimeout time.Duration
}

func (nc *NodeController) Run(stopCh <-chan struct{}) {
    defer utilruntime.HandleCrash()
    
    // Start informer factories
    go nc.nodeInformer.Informer().Run(stopCh)
    
    // Wait for caches to sync
    if !cache.WaitForCacheSync(stopCh, nc.nodeInformer.Informer().HasSynced) {
        return
    }
    
    // Start workers
    for i := 0; i < workers; i++ {
        go wait.Until(nc.worker, time.Second, stopCh)
    }
    
    <-stopCh
}

2. Service Controller

apiVersion: v1
kind: Service
metadata:
  name: cloud-lb
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: nlb
    service.beta.kubernetes.io/aws-load-balancer-internal: "true"
spec:
  type: LoadBalancer
  ports:
  - port: 80
    targetPort: 8080
  selector:
    app: web

Resource Management

1. Load Balancer Management

apiVersion: v1
kind: Service
metadata:
  name: cloud-lb
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "arn:aws:acm:region:account:certificate/cert-id"
    service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
    service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443"
spec:
  type: LoadBalancer
  ports:
  - port: 443
    targetPort: 8080
  selector:
    app: web

2. Node Management

apiVersion: v1
kind: Node
metadata:
  name: cloud-node
  labels:
    failure-domain.beta.kubernetes.io/zone: us-east-1a
    node.kubernetes.io/instance-type: m5.large
spec:
  providerID: aws:///us-east-1a/i-0123456789abcdef0

Performance Tuning

1. Controller Settings

apiVersion: v1
kind: ConfigMap
metadata:
  name: cloud-controller-manager-config
  namespace: kube-system
data:
  config.yaml: |
    kind: KubeControllerManagerConfiguration
    apiVersion: controller-manager.config.k8s.io/v1alpha1
    controllers:
    - cloud-node
    - cloud-node-lifecycle
    - service
    - route
    leaderElection:
      leaderElect: true
    cloudProvider:
      name: aws
      cloudConfigFile: /etc/kubernetes/cloud.conf

2. Resource Configuration

apiVersion: v1
kind: Pod
metadata:
  name: cloud-controller-manager
  namespace: kube-system
spec:
  containers:
  - name: cloud-controller-manager
    resources:
      requests:
        cpu: "200m"
        memory: "512Mi"
      limits:
        cpu: "1"
        memory: "1Gi"

Monitoring and Metrics

1. Controller Metrics

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: cloud-controller-metrics
spec:
  endpoints:
  - interval: 30s
    port: metrics
  selector:
    matchLabels:
      k8s-app: cloud-controller-manager

2. Important Metrics

# Key metrics to monitor
cloudprovider_aws_api_request_duration_seconds
cloudprovider_aws_api_request_errors
cloud_controller_manager_workqueue_adds_total
cloud_controller_manager_workqueue_depth

Troubleshooting

Common Issues

  1. Load Balancer Issues
# Check service status
kubectl describe service cloud-lb

# View controller logs
kubectl logs -n kube-system cloud-controller-manager

# Check cloud provider events
kubectl get events --field-selector reason=FailedLoadBalancer
  1. Node Problems
# Verify node status
kubectl describe node cloud-node

# Check node controller logs
kubectl logs -n kube-system cloud-controller-manager -c cloud-node-controller

# View cloud provider node status
kubectl get nodes -o custom-columns=NAME:.metadata.name,PROVIDER-ID:.spec.providerID
  1. Route Issues
# Check route controller logs
kubectl logs -n kube-system cloud-controller-manager -c route-controller

# Verify route tables
kubectl get nodes -o custom-columns=NAME:.metadata.name,CIDR:.spec.podCIDR

Best Practices

  1. High Availability

    • Deploy multiple replicas
    • Use proper leader election
    • Configure proper timeouts
    • Implement health checks
  2. Security

    • Use IAM roles/service accounts
    • Limit cloud provider permissions
    • Enable audit logging
    • Implement network policies
  3. Performance

    • Configure rate limiting
    • Set appropriate timeouts
    • Monitor API quotas
    • Use caching effectively

Advanced Configuration

1. Custom Cloud Provider

// Custom cloud provider implementation
type CustomCloudProvider struct {
    client    *CustomClient
    nodeInfos map[string]*NodeInfo
}

func (c *CustomCloudProvider) Initialize(clientBuilder controller.ControllerClientBuilder, stop <-chan struct{}) {
    c.client = NewCustomClient()
    go c.syncNodeInfos(stop)
}

func (c *CustomCloudProvider) ProviderName() string {
    return "custom-cloud"
}

2. Load Balancer Customization

apiVersion: v1
kind: Service
metadata:
  name: custom-lb
  annotations:
    custom.cloud.provider/lb-type: "application"
    custom.cloud.provider/target-type: "ip"
    custom.cloud.provider/subnet-selection: "private"
spec:
  type: LoadBalancer
  ports:
  - port: 443
    targetPort: 8080

For more information, check out: