Deep Dive: Kubernetes Cloud Controllers
The Cloud Controller Manager enables Kubernetes to interact with cloud provider APIs, managing resources like load balancers and storage. This deep dive explores its architecture, implementation, and cloud provider integration.
Architecture Overview
Component Architecture
Kubernetes -> Cloud Controller Manager -> Cloud Provider API
                                     -> Node Controller
                                     -> Route Controller
                                     -> Service Controller
Key Components
- Node Controller - Node Lifecycle
- Node Labeling
- Node Address Management
 
- Route Controller - Network Routes
- Pod CIDR Assignment
- Cloud Network Integration
 
- Service Controller - Load Balancer Provisioning
- Service Endpoint Management
- External IP Management
 
Cloud Provider Integration
1. Cloud Provider Interface
// Cloud provider interface
type Interface interface {
    Initialize(clientBuilder controller.ControllerClientBuilder, stop <-chan struct{})
    LoadBalancer() (LoadBalancer, bool)
    Instances() (Instances, bool)
    Zones() (Zones, bool)
    Clusters() (Clusters, bool)
    Routes() (Routes, bool)
    ProviderName() string
    HasClusterID() bool
}
// Load balancer interface
type LoadBalancer interface {
    GetLoadBalancer(ctx context.Context, clusterName string, service *v1.Service) (*LoadBalancerStatus, bool, error)
    EnsureLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*LoadBalancerStatus, error)
    UpdateLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) error
    EnsureLoadBalancerDeleted(ctx context.Context, clusterName string, service *v1.Service) error
}
2. Provider Configuration
apiVersion: v1
kind: ConfigMap
metadata:
  name: cloud-config
  namespace: kube-system
data:
  cloud.conf: |
    [Global]
    zone = "us-east-1a"
    vpc-id = "vpc-123456"
    subnet-id = "subnet-123456"
    
    [LoadBalancer]
    use-instance-security-groups = true
    security-group-ids = "sg-123456"
Controller Implementation
1. Node Controller
// Node controller operations
type NodeController struct {
    cloud cloudprovider.Interface
    kubeClient clientset.Interface
    nodeInformer coreinformers.NodeInformer
    
    // Node monitor period
    nodeMonitorPeriod time.Duration
    
    // Node initialization timeout
    nodeInitializationTimeout time.Duration
}
func (nc *NodeController) Run(stopCh <-chan struct{}) {
    defer utilruntime.HandleCrash()
    
    // Start informer factories
    go nc.nodeInformer.Informer().Run(stopCh)
    
    // Wait for caches to sync
    if !cache.WaitForCacheSync(stopCh, nc.nodeInformer.Informer().HasSynced) {
        return
    }
    
    // Start workers
    for i := 0; i < workers; i++ {
        go wait.Until(nc.worker, time.Second, stopCh)
    }
    
    <-stopCh
}
2. Service Controller
apiVersion: v1
kind: Service
metadata:
  name: cloud-lb
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: nlb
    service.beta.kubernetes.io/aws-load-balancer-internal: "true"
spec:
  type: LoadBalancer
  ports:
  - port: 80
    targetPort: 8080
  selector:
    app: web
Resource Management
1. Load Balancer Management
apiVersion: v1
kind: Service
metadata:
  name: cloud-lb
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-ssl-cert: "arn:aws:acm:region:account:certificate/cert-id"
    service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
    service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443"
spec:
  type: LoadBalancer
  ports:
  - port: 443
    targetPort: 8080
  selector:
    app: web
2. Node Management
apiVersion: v1
kind: Node
metadata:
  name: cloud-node
  labels:
    failure-domain.beta.kubernetes.io/zone: us-east-1a
    node.kubernetes.io/instance-type: m5.large
spec:
  providerID: aws:///us-east-1a/i-0123456789abcdef0
Performance Tuning
1. Controller Settings
apiVersion: v1
kind: ConfigMap
metadata:
  name: cloud-controller-manager-config
  namespace: kube-system
data:
  config.yaml: |
    kind: KubeControllerManagerConfiguration
    apiVersion: controller-manager.config.k8s.io/v1alpha1
    controllers:
    - cloud-node
    - cloud-node-lifecycle
    - service
    - route
    leaderElection:
      leaderElect: true
    cloudProvider:
      name: aws
      cloudConfigFile: /etc/kubernetes/cloud.conf
2. Resource Configuration
apiVersion: v1
kind: Pod
metadata:
  name: cloud-controller-manager
  namespace: kube-system
spec:
  containers:
  - name: cloud-controller-manager
    resources:
      requests:
        cpu: "200m"
        memory: "512Mi"
      limits:
        cpu: "1"
        memory: "1Gi"
Monitoring and Metrics
1. Controller Metrics
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: cloud-controller-metrics
spec:
  endpoints:
  - interval: 30s
    port: metrics
  selector:
    matchLabels:
      k8s-app: cloud-controller-manager
2. Important Metrics
# Key metrics to monitor
cloudprovider_aws_api_request_duration_seconds
cloudprovider_aws_api_request_errors
cloud_controller_manager_workqueue_adds_total
cloud_controller_manager_workqueue_depth
Troubleshooting
Common Issues
- Load Balancer Issues
# Check service status
kubectl describe service cloud-lb
# View controller logs
kubectl logs -n kube-system cloud-controller-manager
# Check cloud provider events
kubectl get events --field-selector reason=FailedLoadBalancer
- Node Problems
# Verify node status
kubectl describe node cloud-node
# Check node controller logs
kubectl logs -n kube-system cloud-controller-manager -c cloud-node-controller
# View cloud provider node status
kubectl get nodes -o custom-columns=NAME:.metadata.name,PROVIDER-ID:.spec.providerID
- Route Issues
# Check route controller logs
kubectl logs -n kube-system cloud-controller-manager -c route-controller
# Verify route tables
kubectl get nodes -o custom-columns=NAME:.metadata.name,CIDR:.spec.podCIDR
Best Practices
- High Availability - Deploy multiple replicas
- Use proper leader election
- Configure proper timeouts
- Implement health checks
 
- Security - Use IAM roles/service accounts
- Limit cloud provider permissions
- Enable audit logging
- Implement network policies
 
- Performance - Configure rate limiting
- Set appropriate timeouts
- Monitor API quotas
- Use caching effectively
 
Advanced Configuration
1. Custom Cloud Provider
// Custom cloud provider implementation
type CustomCloudProvider struct {
    client    *CustomClient
    nodeInfos map[string]*NodeInfo
}
func (c *CustomCloudProvider) Initialize(clientBuilder controller.ControllerClientBuilder, stop <-chan struct{}) {
    c.client = NewCustomClient()
    go c.syncNodeInfos(stop)
}
func (c *CustomCloudProvider) ProviderName() string {
    return "custom-cloud"
}
2. Load Balancer Customization
apiVersion: v1
kind: Service
metadata:
  name: custom-lb
  annotations:
    custom.cloud.provider/lb-type: "application"
    custom.cloud.provider/target-type: "ip"
    custom.cloud.provider/subnet-selection: "private"
spec:
  type: LoadBalancer
  ports:
  - port: 443
    targetPort: 8080
For more information, check out:
