Part 28: Service Mesh Architecture - Modern Service Communication

"A service mesh is the nervous system of your microservices - it handles all the communication complexity so your services can focus on business logic."
Welcome to Part 28 of our distributed systems course! After mastering load balancing, we now explore service mesh architecture - the modern approach to managing service-to-service communication.

What is a Service Mesh?

A service mesh is a dedicated infrastructure layer that handles communication between services. It provides:
  • Traffic management: Load balancing, routing, retries
  • Security: mTLS, authentication, authorization
  • Observability: Metrics, tracing, logging
  • Resilience: Circuit breakers, timeouts, rate limiting

Core Concepts

The Sidecar Proxy Pattern

go
package servicemesh import ( "context" "crypto/tls" "crypto/x509" "fmt" "net/http" "sync" "time" ) // Sidecar represents a sidecar proxy for a service type Sidecar struct { serviceName string servicePort int proxyPort int // Components loadBalancer LoadBalancer circuitBreaker *CircuitBreaker rateLimiter *RateLimiter tracer *Tracer metricsClient *MetricsClient // TLS tlsConfig *tls.Config certManager *CertificateManager // Service discovery discovery ServiceDiscovery // Configuration config *SidecarConfig server *http.Server mu sync.RWMutex } // SidecarConfig holds sidecar configuration type SidecarConfig struct { ServiceName string ServicePort int ProxyPort int // Traffic management RetryAttempts int RetryDelay time.Duration Timeout time.Duration // Circuit breaker CBThreshold int CBTimeout time.Duration // Rate limiting RateLimit int RateLimitWindow time.Duration // mTLS EnableMTLS bool CertPath string KeyPath string CAPath string } // NewSidecar creates a new sidecar proxy func NewSidecar(cfg *SidecarConfig) (*Sidecar, error) { s := &Sidecar{ serviceName: cfg.ServiceName, servicePort: cfg.ServicePort, proxyPort: cfg.ProxyPort, config: cfg, } // Initialize components s.loadBalancer = NewRoundRobinBalancer() s.circuitBreaker = NewCircuitBreaker(cfg.CBThreshold, cfg.CBTimeout) s.rateLimiter = NewRateLimiter(cfg.RateLimit, cfg.RateLimitWindow) s.tracer = NewTracer(cfg.ServiceName) s.metricsClient = NewMetricsClient() // Initialize mTLS if enabled if cfg.EnableMTLS { tlsConfig, err := s.setupMTLS(cfg) if err != nil { return nil, fmt.Errorf("mTLS setup failed: %w", err) } s.tlsConfig = tlsConfig } return s, nil } func (s *Sidecar) setupMTLS(cfg *SidecarConfig) (*tls.Config, error) { // Load certificate cert, err := tls.LoadX509KeyPair(cfg.CertPath, cfg.KeyPath) if err != nil { return nil, err } // Load CA certificate caCert, err := os.ReadFile(cfg.CAPath) if err != nil { return nil, err } caCertPool := x509.NewCertPool() caCertPool.AppendCertsFromPEM(caCert) return 
&tls.Config{ Certificates: []tls.Certificate{cert}, ClientCAs: caCertPool, ClientAuth: tls.RequireAndVerifyClientCert, MinVersion: tls.VersionTLS12, }, nil } // Start starts the sidecar proxy func (s *Sidecar) Start() error { mux := http.NewServeMux() // Inbound traffic handler mux.HandleFunc("/", s.handleInbound) // Outbound traffic handler mux.HandleFunc("/proxy/", s.handleOutbound) // Health and metrics endpoints mux.HandleFunc("/health", s.handleHealth) mux.HandleFunc("/metrics", s.handleMetrics) s.server = &http.Server{ Addr: fmt.Sprintf(":%d", s.proxyPort), Handler: mux, TLSConfig: s.tlsConfig, } if s.config.EnableMTLS { return s.server.ListenAndServeTLS("", "") } return s.server.ListenAndServe() } // handleInbound handles incoming requests to the service func (s *Sidecar) handleInbound(w http.ResponseWriter, r *http.Request) { ctx := r.Context() // Start tracing span := s.tracer.StartSpan(ctx, "inbound") defer span.End() // Record metrics start := time.Now() defer func() { s.metricsClient.RecordLatency("inbound", time.Since(start)) }() // Rate limiting if !s.rateLimiter.Allow() { s.metricsClient.IncrementCounter("rate_limited") http.Error(w, "Rate limit exceeded", http.StatusTooManyRequests) return } // Forward to local service targetURL := fmt.Sprintf("http://localhost:%d%s", s.servicePort, r.URL.Path) proxyReq, err := http.NewRequestWithContext(ctx, r.Method, targetURL, r.Body) if err != nil { http.Error(w, err.Error(), http.StatusInternalServerError) return } // Copy headers for key, values := range r.Header { for _, value := range values { proxyReq.Header.Add(key, value) } } // Add tracing headers s.tracer.InjectHeaders(span, proxyReq.Header) resp, err := http.DefaultClient.Do(proxyReq) if err != nil { http.Error(w, err.Error(), http.StatusBadGateway) return } defer resp.Body.Close() // Copy response for key, values := range resp.Header { for _, value := range values { w.Header().Add(key, value) } } w.WriteHeader(resp.StatusCode) io.Copy(w, resp.Body) } 
// handleOutbound handles outgoing requests from the service func (s *Sidecar) handleOutbound(w http.ResponseWriter, r *http.Request) { ctx := r.Context() // Extract target service from path // /proxy/{service}/{path} parts := strings.SplitN(strings.TrimPrefix(r.URL.Path, "/proxy/"), "/", 2) if len(parts) < 1 { http.Error(w, "Invalid proxy path", http.StatusBadRequest) return } targetService := parts[0] targetPath := "/" if len(parts) > 1 { targetPath = "/" + parts[1] } // Start tracing span := s.tracer.StartSpan(ctx, fmt.Sprintf("outbound:%s", targetService)) defer span.End() // Record metrics start := time.Now() defer func() { s.metricsClient.RecordLatency(fmt.Sprintf("outbound:%s", targetService), time.Since(start)) }() // Circuit breaker check if !s.circuitBreaker.Allow(targetService) { s.metricsClient.IncrementCounter(fmt.Sprintf("circuit_open:%s", targetService)) http.Error(w, "Service unavailable", http.StatusServiceUnavailable) return } // Service discovery endpoints, err := s.discovery.GetEndpoints(ctx, targetService) if err != nil { http.Error(w, "Service not found", http.StatusServiceUnavailable) return } // Load balance and retry var lastErr error for attempt := 0; attempt <= s.config.RetryAttempts; attempt++ { endpoint := s.loadBalancer.Select(endpoints) targetURL := fmt.Sprintf("http://%s%s", endpoint, targetPath) proxyReq, err := http.NewRequestWithContext(ctx, r.Method, targetURL, r.Body) if err != nil { lastErr = err continue } // Copy and add headers for key, values := range r.Header { for _, value := range values { proxyReq.Header.Add(key, value) } } s.tracer.InjectHeaders(span, proxyReq.Header) // Make request with timeout client := &http.Client{Timeout: s.config.Timeout} resp, err := client.Do(proxyReq) if err != nil { lastErr = err s.circuitBreaker.RecordFailure(targetService) time.Sleep(s.config.RetryDelay) continue } // Success s.circuitBreaker.RecordSuccess(targetService) // Copy response defer resp.Body.Close() for key, values := range 
resp.Header { for _, value := range values { w.Header().Add(key, value) } } w.WriteHeader(resp.StatusCode) io.Copy(w, resp.Body) return } http.Error(w, fmt.Sprintf("All retries failed: %v", lastErr), http.StatusBadGateway) }

Service Discovery Integration

go
// ServiceDiscovery provides service endpoint resolution
type ServiceDiscovery interface {
	// GetEndpoints returns "host:port" addresses for the named service.
	GetEndpoints(ctx context.Context, service string) ([]string, error)
	// Watch invokes callback with the new address list whenever the named
	// service's endpoints change. It blocks until the watch channel closes.
	Watch(ctx context.Context, service string, callback func([]string))
}

// KubernetesDiscovery discovers services from Kubernetes
// NOTE(review): relies on k8s.io/client-go and k8s.io/apimachinery types
// (kubernetes, rest, metav1, v1); their imports are not shown in this snippet.
type KubernetesDiscovery struct {
	clientset *kubernetes.Clientset
	namespace string
	// cache maps service name -> []string of "IP:port" addresses. Entries are
	// written on first lookup and overwritten by Watch events, but never
	// expire — without an active Watch, stale endpoints live forever.
	cache sync.Map
}

// NewKubernetesDiscovery creates a Kubernetes service discovery
// scoped to one namespace. It uses in-cluster configuration, so it only
// works when running inside a pod with a service account.
func NewKubernetesDiscovery(namespace string) (*KubernetesDiscovery, error) {
	config, err := rest.InClusterConfig()
	if err != nil {
		return nil, err
	}
	clientset, err := kubernetes.NewForConfig(config)
	if err != nil {
		return nil, err
	}
	return &KubernetesDiscovery{
		clientset: clientset,
		namespace: namespace,
	}, nil
}

// GetEndpoints returns cached addresses when present; otherwise it queries
// the Kubernetes Endpoints API, flattens every subset's address × port
// combination into "IP:port" strings, caches the result, and returns it.
func (kd *KubernetesDiscovery) GetEndpoints(ctx context.Context, service string) ([]string, error) {
	// Check cache first
	if cached, ok := kd.cache.Load(service); ok {
		return cached.([]string), nil
	}

	// Query Kubernetes
	endpoints, err := kd.clientset.CoreV1().Endpoints(kd.namespace).Get(ctx, service, metav1.GetOptions{})
	if err != nil {
		return nil, err
	}

	var addrs []string
	for _, subset := range endpoints.Subsets {
		for _, addr := range subset.Addresses {
			for _, port := range subset.Ports {
				addrs = append(addrs, fmt.Sprintf("%s:%d", addr.IP, port.Port))
			}
		}
	}

	// Update cache
	kd.cache.Store(service, addrs)
	return addrs, nil
}

// Watch streams endpoint changes for one service, refreshing the cache and
// invoking callback on every event. It returns when the watch channel
// closes or the watch cannot be established.
func (kd *KubernetesDiscovery) Watch(ctx context.Context, service string, callback func([]string)) {
	watcher, err := kd.clientset.CoreV1().Endpoints(kd.namespace).Watch(ctx, metav1.ListOptions{
		FieldSelector: fmt.Sprintf("metadata.name=%s", service),
	})
	if err != nil {
		// NOTE(review): the watch error is silently swallowed here; callers
		// cannot distinguish "watch ended" from "watch never started".
		return
	}
	defer watcher.Stop()

	for event := range watcher.ResultChan() {
		if endpoints, ok := event.Object.(*v1.Endpoints); ok {
			var addrs []string
			for _, subset := range endpoints.Subsets {
				for _, addr := range subset.Addresses {
					for _, port := range subset.Ports {
						addrs = append(addrs, fmt.Sprintf("%s:%d", addr.IP, port.Port))
					}
				}
			}
			kd.cache.Store(service, addrs)
			callback(addrs)
		}
	}
}

Traffic Management

Canary Deployments

go
// TrafficRouter splits traffic across service subsets — weighted canary
// splits plus header-pinned routing (e.g. beta users always hit the canary).
type TrafficRouter struct {
	rules []RoutingRule
	mu    sync.RWMutex
}

// RoutingRule defines how traffic for one service is routed.
type RoutingRule struct {
	Service   string
	Subset    string            // e.g., "v1", "v2", "canary"
	Weight    int               // relative weight (a percentage when weights sum to 100)
	Headers   map[string]string // header-based routing; takes precedence over weights
	Endpoints []string
}

// NewTrafficRouter creates an empty traffic router.
func NewTrafficRouter() *TrafficRouter {
	return &TrafficRouter{rules: make([]RoutingRule, 0)}
}

// AddRule adds a routing rule. Safe for concurrent use.
func (tr *TrafficRouter) AddRule(rule RoutingRule) {
	tr.mu.Lock()
	defer tr.mu.Unlock()
	tr.rules = append(tr.rules, rule)
}

// Route selects endpoints for service. A rule whose Headers all match the
// request wins outright; otherwise a weighted random choice is made among
// the service's weight-based rules. Returns nil when no rule matches.
func (tr *TrafficRouter) Route(service string, headers http.Header) []string {
	tr.mu.RLock()
	defer tr.mu.RUnlock()

	// Header-based rules take precedence.
	for _, rule := range tr.rules {
		if rule.Service != service || len(rule.Headers) == 0 {
			continue
		}
		match := true
		for key, want := range rule.Headers {
			if headers.Get(key) != want {
				match = false
				break
			}
		}
		if match {
			return rule.Endpoints
		}
	}

	// Collect this service's weight-based rules.
	var weighted []RoutingRule
	totalWeight := 0
	for _, rule := range tr.rules {
		if rule.Service == service && len(rule.Headers) == 0 {
			weighted = append(weighted, rule)
			totalWeight += rule.Weight
		}
	}
	if len(weighted) == 0 {
		return nil
	}
	// Guard: rand.Intn panics on a non-positive argument, which the original
	// hit whenever all matching rules had weight 0.
	if totalWeight <= 0 {
		return weighted[0].Endpoints
	}

	// Weighted random pick. (The original named this local "rand", shadowing
	// the math/rand package.)
	pick := rand.Intn(totalWeight)
	cumulative := 0
	for _, rule := range weighted {
		cumulative += rule.Weight
		if pick < cumulative {
			return rule.Endpoints
		}
	}
	return weighted[0].Endpoints
}

// configureCanary shows a 90/10 stable/canary split for payment-service,
// with beta testers pinned to the canary via the X-Beta-User header.
func configureCanary(router *TrafficRouter) {
	// 90% to stable version
	router.AddRule(RoutingRule{
		Service:   "payment-service",
		Subset:    "stable",
		Weight:    90,
		Endpoints: []string{"payment-v1:8080"},
	})
	// 10% to canary
	router.AddRule(RoutingRule{
		Service:   "payment-service",
		Subset:    "canary",
		Weight:    10,
		Endpoints: []string{"payment-v2:8080"},
	})
	// Beta testers always get canary
	router.AddRule(RoutingRule{
		Service:   "payment-service",
		Subset:    "canary",
		Headers:   map[string]string{"X-Beta-User": "true"},
		Endpoints: []string{"payment-v2:8080"},
	})
}

Request Mirroring

go
// RequestMirror mirrors requests to secondary endpoints type RequestMirror struct { primaryEndpoint string mirrorEndpoints []string mirrorPercentage int // 0-100 } // NewRequestMirror creates a request mirror func NewRequestMirror(primary string, mirrors []string, percentage int) *RequestMirror { return &RequestMirror{ primaryEndpoint: primary, mirrorEndpoints: mirrors, mirrorPercentage: percentage, } } // Forward sends request to primary and optionally mirrors func (rm *RequestMirror) Forward(ctx context.Context, req *http.Request) (*http.Response, error) { // Send to primary primaryReq := req.Clone(ctx) primaryReq.URL.Host = rm.primaryEndpoint // Maybe mirror (async, don't wait for response) if rand.Intn(100) < rm.mirrorPercentage { for _, mirror := range rm.mirrorEndpoints { go func(endpoint string) { mirrorCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() mirrorReq := req.Clone(mirrorCtx) mirrorReq.URL.Host = endpoint mirrorReq.Header.Set("X-Mirror-Request", "true") // Fire and forget resp, err := http.DefaultClient.Do(mirrorReq) if err == nil { resp.Body.Close() } }(mirror) } } return http.DefaultClient.Do(primaryReq) }

Security: mTLS and Authorization

go
// CertificateManager issues short-lived service certificates signed by a
// mesh-local CA — the basis of mTLS workload identity.
type CertificateManager struct {
	ca           *x509.Certificate
	caKey        *ecdsa.PrivateKey
	certificates sync.Map // service name -> *tls.Certificate
}

// NewCertificateManager loads the CA certificate and its ECDSA private key
// from PEM files on disk.
func NewCertificateManager(caPath, caKeyPath string) (*CertificateManager, error) {
	caCertPEM, err := os.ReadFile(caPath)
	if err != nil {
		return nil, err
	}
	caKeyPEM, err := os.ReadFile(caKeyPath)
	if err != nil {
		return nil, err
	}

	// pem.Decode returns nil on malformed input; the original dereferenced
	// the block without checking and would panic on a bad file.
	block, _ := pem.Decode(caCertPEM)
	if block == nil {
		return nil, fmt.Errorf("no PEM block found in CA certificate %s", caPath)
	}
	ca, err := x509.ParseCertificate(block.Bytes)
	if err != nil {
		return nil, err
	}

	keyBlock, _ := pem.Decode(caKeyPEM)
	if keyBlock == nil {
		return nil, fmt.Errorf("no PEM block found in CA key %s", caKeyPath)
	}
	caKey, err := x509.ParseECPrivateKey(keyBlock.Bytes)
	if err != nil {
		return nil, err
	}

	return &CertificateManager{ca: ca, caKey: caKey}, nil
}

// GenerateCertificate issues a 24-hour client+server certificate for
// serviceName, signed by the mesh CA, caches it, and returns it.
func (cm *CertificateManager) GenerateCertificate(serviceName string) (*tls.Certificate, error) {
	privateKey, err := ecdsa.GenerateKey(elliptic.P256(), crypto_rand.Reader)
	if err != nil {
		return nil, err
	}

	template := &x509.Certificate{
		// NOTE(review): UnixNano is unique enough for a demo but not
		// collision-proof; a production CA should use random serials.
		SerialNumber: big.NewInt(time.Now().UnixNano()),
		Subject: pkix.Name{
			CommonName:   serviceName,
			Organization: []string{"ServiceMesh"},
		},
		NotBefore: time.Now(),
		NotAfter:  time.Now().Add(24 * time.Hour),
		// ECDSA keys only sign; KeyEncipherment applies to RSA key transport
		// and is incorrect on EC certificates (RFC 5280 §4.2.1.3).
		KeyUsage:              x509.KeyUsageDigitalSignature,
		ExtKeyUsage:           []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth, x509.ExtKeyUsageServerAuth},
		BasicConstraintsValid: true,
		DNSNames:              []string{serviceName, fmt.Sprintf("%s.default.svc.cluster.local", serviceName)},
	}

	certDER, err := x509.CreateCertificate(crypto_rand.Reader, template, cm.ca, &privateKey.PublicKey, cm.caKey)
	if err != nil {
		return nil, err
	}

	cert := &tls.Certificate{
		Certificate: [][]byte{certDER},
		PrivateKey:  privateKey,
	}
	cm.certificates.Store(serviceName, cert)
	return cert, nil
}

// AuthorizationPolicy is a named set of allow rules scoped to a namespace
// ("*" matches every namespace).
type AuthorizationPolicy struct {
	Name      string
	Namespace string
	Rules     []AuthRule
}

// AuthRule allows requests whose caller matches From, whose operation
// matches To, and whose conditions match When. Empty sections match all.
type AuthRule struct {
	From []Source
	To   []Operation
	When []Condition
}

// Source identifies a caller. Within one Source, both Principals and
// Namespaces must match (empty list = match anything).
type Source struct {
	Principals []string // Service identities
	Namespaces []string
}

// Operation identifies a request's method/path/port.
type Operation struct {
	Methods []string
	Paths   []string
	Ports   []int
}

// Condition matches a request header against allowed values.
type Condition struct {
	Key    string
	Values []string
}

// AuthorizationEnforcer enforces authorization policies (default deny).
type AuthorizationEnforcer struct {
	policies []AuthorizationPolicy
	mu       sync.RWMutex
}

// NewAuthorizationEnforcer creates an enforcer with no policies, so every
// request is denied until a policy is added.
func NewAuthorizationEnforcer() *AuthorizationEnforcer {
	return &AuthorizationEnforcer{
		policies: make([]AuthorizationPolicy, 0),
	}
}

// AddPolicy adds an authorization policy. Safe for concurrent use.
func (ae *AuthorizationEnforcer) AddPolicy(policy AuthorizationPolicy) {
	ae.mu.Lock()
	defer ae.mu.Unlock()
	ae.policies = append(ae.policies, policy)
}

// Authorize reports whether the request matches any rule of any applicable
// policy. No match means deny.
func (ae *AuthorizationEnforcer) Authorize(req *AuthorizationRequest) (bool, error) {
	ae.mu.RLock()
	defer ae.mu.RUnlock()

	for _, policy := range ae.policies {
		if policy.Namespace != req.Namespace && policy.Namespace != "*" {
			continue
		}
		for _, rule := range policy.Rules {
			if ae.matchesSource(rule.From, req) &&
				ae.matchesOperation(rule.To, req) &&
				ae.matchesConditions(rule.When, req) {
				return true, nil
			}
		}
	}
	return false, nil
}

// AuthorizationRequest captures the attributes of one service-to-service call.
type AuthorizationRequest struct {
	SourceService   string
	SourceNamespace string
	TargetService   string
	Namespace       string
	Method          string
	Path            string
	Port            int
	Headers         http.Header
}

// matchesSource reports whether the caller matches any source. Within a
// single Source, Principals and Namespaces must BOTH match (an empty list
// matches anything). The original OR-ed the two lists, letting any caller
// in an allowed namespace bypass a principal restriction — a policy hole.
func (ae *AuthorizationEnforcer) matchesSource(sources []Source, req *AuthorizationRequest) bool {
	if len(sources) == 0 {
		return true
	}
	for _, source := range sources {
		principalOK := len(source.Principals) == 0
		for _, principal := range source.Principals {
			if principal == "*" || principal == req.SourceService {
				principalOK = true
				break
			}
		}
		namespaceOK := len(source.Namespaces) == 0
		for _, ns := range source.Namespaces {
			if ns == "*" || ns == req.SourceNamespace {
				namespaceOK = true
				break
			}
		}
		if principalOK && namespaceOK {
			return true
		}
	}
	return false
}

// matchesOperation reports whether any operation matches the request's
// method (exact or "*") and path (prefix or "*"). Empty lists match all.
func (ae *AuthorizationEnforcer) matchesOperation(ops []Operation, req *AuthorizationRequest) bool {
	if len(ops) == 0 {
		return true
	}
	for _, op := range ops {
		methodMatch := len(op.Methods) == 0
		for _, m := range op.Methods {
			if m == "*" || m == req.Method {
				methodMatch = true
				break
			}
		}
		pathMatch := len(op.Paths) == 0
		for _, p := range op.Paths {
			if p == "*" || strings.HasPrefix(req.Path, p) {
				pathMatch = true
				break
			}
		}
		if methodMatch && pathMatch {
			return true
		}
	}
	return false
}

// matchesConditions reports whether every condition's header has one of its
// allowed values. Empty condition list matches all.
func (ae *AuthorizationEnforcer) matchesConditions(conditions []Condition, req *AuthorizationRequest) bool {
	if len(conditions) == 0 {
		return true
	}
	for _, cond := range conditions {
		headerValue := req.Headers.Get(cond.Key)
		match := false
		for _, v := range cond.Values {
			if v == headerValue {
				match = true
				break
			}
		}
		if !match {
			return false
		}
	}
	return true
}

Observability

Distributed Tracing

go
// ctxKey is an unexported context-key type so trace values cannot collide
// with plain string keys set by other packages (string keys in
// context.WithValue are flagged by go vet / staticcheck SA1029).
type ctxKey string

const (
	ctxKeyTraceID ctxKey = "trace_id"
	ctxKeySpanID  ctxKey = "span_id"
)

// Tracer provides lightweight distributed tracing: it creates spans, keeps
// them in an in-memory map, and propagates trace context via HTTP headers.
type Tracer struct {
	serviceName string
	spans       sync.Map // span ID -> *Span
}

// Span represents one timed operation within a trace.
type Span struct {
	TraceID   string
	SpanID    string
	ParentID  string
	Name      string
	Service   string
	StartTime time.Time
	EndTime   time.Time
	Tags      map[string]string
	Logs      []SpanLog
	mu        sync.Mutex
}

// SpanLog is a timestamped message attached to a span.
type SpanLog struct {
	Timestamp time.Time
	Message   string
}

// NewTracer creates a tracer for the named service.
func NewTracer(serviceName string) *Tracer {
	return &Tracer{serviceName: serviceName}
}

// StartSpan starts a new span. If ctx carries trace context (see
// ExtractContext), the span joins that trace as a child; otherwise a fresh
// trace ID is generated.
func (t *Tracer) StartSpan(ctx context.Context, name string) *Span {
	var traceID, parentID string
	// Two-value assertions: only this package sets these keys, but a checked
	// assertion cannot panic even if something unexpected is stored.
	if existing, ok := ctx.Value(ctxKeyTraceID).(string); ok {
		traceID = existing
		if parent, ok := ctx.Value(ctxKeySpanID).(string); ok {
			parentID = parent
		}
	} else {
		traceID = generateID()
	}

	span := &Span{
		TraceID:   traceID,
		SpanID:    generateID(),
		ParentID:  parentID,
		Name:      name,
		Service:   t.serviceName,
		StartTime: time.Now(),
		Tags:      make(map[string]string),
	}
	t.spans.Store(span.SpanID, span)
	return span
}

// ExtractContext extracts trace context from incoming request headers.
func (t *Tracer) ExtractContext(headers http.Header) context.Context {
	ctx := context.Background()
	if traceID := headers.Get("X-Trace-ID"); traceID != "" {
		ctx = context.WithValue(ctx, ctxKeyTraceID, traceID)
	}
	if spanID := headers.Get("X-Span-ID"); spanID != "" {
		ctx = context.WithValue(ctx, ctxKeySpanID, spanID)
	}
	return ctx
}

// InjectHeaders writes the span's trace context into outgoing headers.
func (t *Tracer) InjectHeaders(span *Span, headers http.Header) {
	headers.Set("X-Trace-ID", span.TraceID)
	headers.Set("X-Span-ID", span.SpanID)
	if span.ParentID != "" {
		headers.Set("X-Parent-Span-ID", span.ParentID)
	}
}

// End records the span's finish time.
func (s *Span) End() {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.EndTime = time.Now()
}

// SetTag attaches a key/value tag to the span.
func (s *Span) SetTag(key, value string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.Tags[key] = value
}

// Log appends a timestamped message to the span.
func (s *Span) Log(message string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.Logs = append(s.Logs, SpanLog{
		Timestamp: time.Now(),
		Message:   message,
	})
}

// generateID returns 16 random bytes hex-encoded (32 characters).
func generateID() string {
	b := make([]byte, 16)
	// crypto/rand.Read never fails on supported platforms, but check anyway
	// so a broken entropy source cannot silently yield all-zero IDs.
	if _, err := crypto_rand.Read(b); err != nil {
		panic(err)
	}
	return hex.EncodeToString(b)
}

Metrics Collection

go
// MetricsClient collects service mesh metrics type MetricsClient struct { counters sync.Map histograms sync.Map gauges sync.Map } // NewMetricsClient creates a metrics client func NewMetricsClient() *MetricsClient { return &MetricsClient{} } // IncrementCounter increments a counter func (m *MetricsClient) IncrementCounter(name string) { actual, _ := m.counters.LoadOrStore(name, new(int64)) atomic.AddInt64(actual.(*int64), 1) } // RecordLatency records a latency observation func (m *MetricsClient) RecordLatency(name string, duration time.Duration) { actual, _ := m.histograms.LoadOrStore(name, &LatencyHistogram{}) actual.(*LatencyHistogram).Observe(duration) } // SetGauge sets a gauge value func (m *MetricsClient) SetGauge(name string, value float64) { m.gauges.Store(name, value) } // LatencyHistogram tracks latency distribution type LatencyHistogram struct { sum int64 count int64 buckets [10]int64 // p50, p75, p90, p95, p99, etc. mu sync.Mutex } func (h *LatencyHistogram) Observe(d time.Duration) { ms := d.Milliseconds() atomic.AddInt64(&h.sum, ms) atomic.AddInt64(&h.count, 1) // Update buckets h.mu.Lock() defer h.mu.Unlock() thresholds := []int64{1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000} for i, threshold := range thresholds { if ms <= threshold { h.buckets[i]++ break } } } // GetMetrics returns all metrics func (m *MetricsClient) GetMetrics() map[string]interface{} { metrics := make(map[string]interface{}) m.counters.Range(func(key, value interface{}) bool { metrics[fmt.Sprintf("counter_%s", key)] = atomic.LoadInt64(value.(*int64)) return true }) m.histograms.Range(func(key, value interface{}) bool { h := value.(*LatencyHistogram) count := atomic.LoadInt64(&h.count) if count > 0 { sum := atomic.LoadInt64(&h.sum) metrics[fmt.Sprintf("latency_%s_avg_ms", key)] = float64(sum) / float64(count) } return true }) m.gauges.Range(func(key, value interface{}) bool { metrics[fmt.Sprintf("gauge_%s", key)] = value return true }) return metrics }

Control Plane

go
// ControlPlane manages service mesh configuration type ControlPlane struct { // Configuration routingRules sync.Map rateLimitRules sync.Map authPolicies sync.Map // Service registry services sync.Map // Connected proxies proxies sync.Map // Metrics aggregation metricsStore *MetricsStore } // NewControlPlane creates a control plane func NewControlPlane() *ControlPlane { return &ControlPlane{ metricsStore: NewMetricsStore(), } } // RegisterProxy registers a sidecar proxy func (cp *ControlPlane) RegisterProxy(proxyID string, info ProxyInfo) { cp.proxies.Store(proxyID, info) } // PushConfig pushes configuration to a proxy func (cp *ControlPlane) PushConfig(proxyID string) (*ProxyConfig, error) { config := &ProxyConfig{ RoutingRules: cp.getRoutingRules(), RateLimitRules: cp.getRateLimitRules(), AuthPolicies: cp.getAuthPolicies(), Services: cp.getServices(), } return config, nil } type ProxyInfo struct { ID string Service string Address string Version string LastSeen time.Time } type ProxyConfig struct { RoutingRules []RoutingRule RateLimitRules []RateLimitRule AuthPolicies []AuthorizationPolicy Services []ServiceInfo } type ServiceInfo struct { Name string Endpoints []string Port int Protocol string } func (cp *ControlPlane) getRoutingRules() []RoutingRule { var rules []RoutingRule cp.routingRules.Range(func(key, value interface{}) bool { rules = append(rules, value.(RoutingRule)) return true }) return rules } func (cp *ControlPlane) getRateLimitRules() []RateLimitRule { var rules []RateLimitRule cp.rateLimitRules.Range(func(key, value interface{}) bool { rules = append(rules, value.(RateLimitRule)) return true }) return rules } func (cp *ControlPlane) getAuthPolicies() []AuthorizationPolicy { var policies []AuthorizationPolicy cp.authPolicies.Range(func(key, value interface{}) bool { policies = append(policies, value.(AuthorizationPolicy)) return true }) return policies } func (cp *ControlPlane) getServices() []ServiceInfo { var services []ServiceInfo 
cp.services.Range(func(key, value interface{}) bool { services = append(services, value.(ServiceInfo)) return true }) return services } // MetricsStore aggregates metrics from all proxies type MetricsStore struct { metrics sync.Map } func NewMetricsStore() *MetricsStore { return &MetricsStore{} } func (ms *MetricsStore) Store(proxyID string, metrics map[string]interface{}) { ms.metrics.Store(proxyID, metrics) } func (ms *MetricsStore) GetAggregated() map[string]interface{} { aggregated := make(map[string]interface{}) ms.metrics.Range(func(key, value interface{}) bool { proxyMetrics := value.(map[string]interface{}) for name, val := range proxyMetrics { if existing, ok := aggregated[name]; ok { // Sum numeric values switch v := val.(type) { case int64: aggregated[name] = existing.(int64) + v case float64: aggregated[name] = existing.(float64) + v } } else { aggregated[name] = val } } return true }) return aggregated }

Best Practices

  1. Start simple - Begin with basic traffic management, add features gradually
  2. Monitor everything - Service meshes provide rich observability; use it
  3. Test resilience - Inject failures to verify circuit breakers and retries work
  4. Secure by default - Enable mTLS for all service-to-service communication
  5. Manage configuration centrally - Use the control plane for consistent policy enforcement

What's Next?

In Part 29, we'll explore Distributed Tracing Deep Dive - implementing comprehensive tracing across your distributed system.

"A service mesh doesn't add complexity - it manages the complexity that already exists in your distributed system."
All Blogs
Tags: service-mesh, microservices, observability, security