Part 30: Building Production-Ready Distributed Systems
"A system isn't production-ready when it works. It's production-ready when it works reliably, fails gracefully, and can be understood, monitored, and maintained by your team."
Welcome to Part 30 - the final chapter of our distributed systems course! We'll bring together everything we've learned to build truly production-ready systems.
The Production-Ready Checklist
Before deploying any distributed system, ensure you've addressed these areas:
- Reliability: Handles failures gracefully
- Scalability: Grows with demand
- Observability: You can see what's happening
- Security: Protected against threats
- Operability: Easy to deploy, configure, and maintain
Complete Service Template
Let's build a production-ready service that incorporates everything we've covered:
```go
package main

import (
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"runtime/debug" // used by the recovery middleware below
	"strconv"       // used by the config helpers below
	"sync"
	"syscall"
	"time"

	_ "github.com/lib/pq" // Postgres driver, registered for database/sql
)

// Config holds all service configuration
type Config struct {
	// Server
	ServerPort         int           `env:"SERVER_PORT" default:"8080"`
	ServerReadTimeout  time.Duration `env:"SERVER_READ_TIMEOUT" default:"5s"`
	ServerWriteTimeout time.Duration `env:"SERVER_WRITE_TIMEOUT" default:"10s"`
	ShutdownTimeout    time.Duration `env:"SHUTDOWN_TIMEOUT" default:"30s"`

	// Database
	DatabaseURL      string        `env:"DATABASE_URL" required:"true"`
	DatabaseMaxConns int           `env:"DATABASE_MAX_CONNS" default:"25"`
	DatabaseMaxIdle  int           `env:"DATABASE_MAX_IDLE" default:"10"`
	DatabaseConnLife time.Duration `env:"DATABASE_CONN_LIFE" default:"5m"`

	// Resilience
	CircuitBreakerThreshold int           `env:"CB_THRESHOLD" default:"5"`
	CircuitBreakerTimeout   time.Duration `env:"CB_TIMEOUT" default:"30s"`
	RateLimitRPS            int           `env:"RATE_LIMIT_RPS" default:"100"`
	BulkheadSize            int           `env:"BULKHEAD_SIZE" default:"50"`

	// Observability
	TracingEndpoint   string  `env:"TRACING_ENDPOINT"`
	TracingSampleRate float64 `env:"TRACING_SAMPLE_RATE" default:"0.1"`
	MetricsPort       int     `env:"METRICS_PORT" default:"9090"`

	// Feature Flags
	EnableNewFeature bool `env:"ENABLE_NEW_FEATURE" default:"false"`
}

// Service is the main application
type Service struct {
	config *Config
	server *http.Server
	db     *sql.DB

	// Resilience components
	circuitBreaker *CircuitBreaker
	rateLimiter    *RateLimiter
	bulkhead       *Bulkhead

	// Observability
	tracer  *Tracer
	metrics *Metrics
	logger  *Logger

	// Health
	health *HealthChecker

	// Lifecycle
	shutdown chan struct{}
	wg       sync.WaitGroup
}

// NewService creates a new service instance
func NewService(cfg *Config) (*Service, error) {
	svc := &Service{
		config:   cfg,
		shutdown: make(chan struct{}),
	}

	var err error

	// Database
	svc.db, err = svc.initDatabase()
	if err != nil {
		return nil, fmt.Errorf("database init failed: %w", err)
	}

	// Resilience
	svc.circuitBreaker = NewCircuitBreaker(cfg.CircuitBreakerThreshold, cfg.CircuitBreakerTimeout)
	svc.rateLimiter = NewRateLimiter(cfg.RateLimitRPS, time.Second)
	svc.bulkhead = NewBulkhead(cfg.BulkheadSize, time.Second)

	// Observability
	svc.logger = NewLogger("service")
	svc.metrics = NewMetrics()
	svc.tracer, err = NewTracer(cfg.TracingEndpoint, cfg.TracingSampleRate)
	if err != nil {
		svc.logger.Warn("tracing disabled", "error", err)
	}

	// Health checker
	svc.health = NewHealthChecker()
	svc.health.AddCheck("database", svc.checkDatabase)

	// HTTP server
	svc.server = svc.initServer()

	return svc, nil
}

func (s *Service) initDatabase() (*sql.DB, error) {
	db, err := sql.Open("postgres", s.config.DatabaseURL)
	if err != nil {
		return nil, err
	}

	db.SetMaxOpenConns(s.config.DatabaseMaxConns)
	db.SetMaxIdleConns(s.config.DatabaseMaxIdle)
	db.SetConnMaxLifetime(s.config.DatabaseConnLife)

	// Verify the connection before serving traffic
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if err := db.PingContext(ctx); err != nil {
		return nil, err
	}

	return db, nil
}

func (s *Service) initServer() *http.Server {
	mux := http.NewServeMux()

	// Apply middleware stack
	handler := s.applyMiddleware(mux)

	// Register routes (mux is a pointer, so routes registered here are
	// visible through the already-wrapped handler)
	s.registerRoutes(mux)

	return &http.Server{
		Addr:         fmt.Sprintf(":%d", s.config.ServerPort),
		Handler:      handler,
		ReadTimeout:  s.config.ServerReadTimeout,
		WriteTimeout: s.config.ServerWriteTimeout,
	}
}

func (s *Service) applyMiddleware(handler http.Handler) http.Handler {
	// Applied in reverse order: the last middleware applied runs first
	handler = s.recoveryMiddleware(handler) // innermost: catch panics from handlers
	handler = s.tracingMiddleware(handler)
	handler = s.metricsMiddleware(handler)
	handler = s.rateLimitMiddleware(handler)
	handler = s.loggingMiddleware(handler)
	// Outermost: the request ID must be set before the logging middleware
	// (and everything below it) runs, so it is applied last
	handler = s.requestIDMiddleware(handler)
	return handler
}

func (s *Service) registerRoutes(mux *http.ServeMux) {
	// Health endpoints
	mux.HandleFunc("/health", s.handleHealth)
	mux.HandleFunc("/ready", s.handleReady)

	// API endpoints (handleOrder, for a single order, is omitted for brevity)
	mux.HandleFunc("/api/v1/orders", s.handleOrders)
	mux.HandleFunc("/api/v1/orders/", s.handleOrder)
}

// Start starts the service and blocks until shutdown completes
func (s *Service) Start() error {
	s.logger.Info("starting service", "port", s.config.ServerPort)

	// Start background tasks
	s.wg.Add(1)
	go s.runBackgroundTasks()

	// Start metrics server
	s.wg.Add(1)
	go s.runMetricsServer()

	// Start HTTP server
	go func() {
		if err := s.server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			s.logger.Error("server error", "error", err)
		}
	}()

	// Wait for shutdown signal
	s.waitForShutdown()
	return nil
}

func (s *Service) waitForShutdown() {
	sigCh := make(chan os.Signal, 1)
	signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)
	<-sigCh

	s.logger.Info("shutdown signal received")

	// Signal all goroutines to stop
	close(s.shutdown)

	// Create shutdown context
	ctx, cancel := context.WithTimeout(context.Background(), s.config.ShutdownTimeout)
	defer cancel()

	// Shutdown HTTP server, letting in-flight requests drain
	if err := s.server.Shutdown(ctx); err != nil {
		s.logger.Error("server shutdown error", "error", err)
	}

	// Wait for background tasks
	done := make(chan struct{})
	go func() {
		s.wg.Wait()
		close(done)
	}()

	select {
	case <-done:
		s.logger.Info("graceful shutdown complete")
	case <-ctx.Done():
		s.logger.Warn("shutdown timeout exceeded")
	}

	// Close database
	if err := s.db.Close(); err != nil {
		s.logger.Error("database close error", "error", err)
	}

	// Shutdown tracer
	if s.tracer != nil {
		s.tracer.Shutdown(ctx)
	}
}

func (s *Service) runBackgroundTasks() {
	defer s.wg.Done()

	ticker := time.NewTicker(1 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ticker.C:
			s.performMaintenance()
		case <-s.shutdown:
			return
		}
	}
}

func (s *Service) performMaintenance() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Example: clean up expired data
	_, err := s.db.ExecContext(ctx, "DELETE FROM sessions WHERE expires_at < NOW()")
	if err != nil {
		s.logger.Error("maintenance task failed", "error", err)
	}
}

func (s *Service) runMetricsServer() {
	defer s.wg.Done()

	mux := http.NewServeMux()
	mux.HandleFunc("/metrics", s.handleMetrics)

	server := &http.Server{
		Addr:    fmt.Sprintf(":%d", s.config.MetricsPort),
		Handler: mux,
	}

	go func() {
		if err := server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			s.logger.Error("metrics server error", "error", err)
		}
	}()

	<-s.shutdown
	server.Shutdown(context.Background())
}
```
Middleware Implementation
```go
// contextKey is a private type so our context values cannot collide
// with values set by other packages.
type contextKey string

const requestIDKey contextKey = "request_id"

// Request ID middleware
func (s *Service) requestIDMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		requestID := r.Header.Get("X-Request-ID")
		if requestID == "" {
			requestID = generateRequestID()
		}
		ctx := context.WithValue(r.Context(), requestIDKey, requestID)
		w.Header().Set("X-Request-ID", requestID)
		next.ServeHTTP(w, r.WithContext(ctx))
	})
}

// Logging middleware
func (s *Service) loggingMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()

		// Wrap the response writer to capture status and size
		wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}
		next.ServeHTTP(wrapped, r)

		duration := time.Since(start)
		// Comma-ok assertion: don't panic if the request ID is missing
		requestID, _ := r.Context().Value(requestIDKey).(string)
		s.logger.Info("request completed",
			"request_id", requestID,
			"method", r.Method,
			"path", r.URL.Path,
			"status", wrapped.statusCode,
			"duration_ms", duration.Milliseconds(),
			"bytes", wrapped.bytesWritten,
		)
	})
}

// Rate limit middleware
func (s *Service) rateLimitMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if !s.rateLimiter.Allow() {
			s.metrics.IncrementCounter("rate_limited_requests")
			w.Header().Set("Retry-After", "1")
			http.Error(w, "Rate limit exceeded", http.StatusTooManyRequests)
			return
		}
		next.ServeHTTP(w, r)
	})
}

// Metrics middleware
func (s *Service) metricsMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}

		next.ServeHTTP(wrapped, r)

		duration := time.Since(start)
		s.metrics.RecordLatency("http_request_duration", r.URL.Path, duration)
		s.metrics.IncrementCounterWithLabels("http_requests_total", map[string]string{
			"method": r.Method,
			"path":   r.URL.Path,
			"status": fmt.Sprintf("%d", wrapped.statusCode),
		})
	})
}

// Tracing middleware
func (s *Service) tracingMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if s.tracer == nil {
			next.ServeHTTP(w, r)
			return
		}

		ctx := s.tracer.Extract(r.Context(), r.Header)
		ctx, span := s.tracer.StartSpan(ctx, fmt.Sprintf("%s %s", r.Method, r.URL.Path))
		defer span.Finish()

		span.SetTag("http.method", r.Method)
		span.SetTag("http.url", r.URL.String())

		wrapped := &responseWriter{ResponseWriter: w, statusCode: 200}
		next.ServeHTTP(wrapped, r.WithContext(ctx))

		span.SetTag("http.status_code", fmt.Sprintf("%d", wrapped.statusCode))
		if wrapped.statusCode >= 400 {
			span.SetStatus(SpanStatusError)
		}
	})
}

// Recovery middleware
func (s *Service) recoveryMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		defer func() {
			if err := recover(); err != nil {
				s.logger.Error("panic recovered",
					"error", err,
					"stack", string(debug.Stack()),
				)
				s.metrics.IncrementCounter("panics_total")
				http.Error(w, "Internal Server Error", http.StatusInternalServerError)
			}
		}()
		next.ServeHTTP(w, r)
	})
}

// responseWriter wraps http.ResponseWriter to record status and byte count
type responseWriter struct {
	http.ResponseWriter
	statusCode   int
	bytesWritten int
}

func (w *responseWriter) WriteHeader(code int) {
	w.statusCode = code
	w.ResponseWriter.WriteHeader(code)
}

func (w *responseWriter) Write(b []byte) (int, error) {
	n, err := w.ResponseWriter.Write(b)
	w.bytesWritten += n
	return n, err
}
```
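The middleware above depends on `Logger` and `Metrics` types we haven't shown in this part. As a stand-in, here is one minimal sketch: a key-value logger over the standard library (in practice you would reach for `log/slog` or zap) and an in-memory counter registry (in practice, a Prometheus client). The names match those used by the template, but the internals are illustrative only:

```go
package main

import (
	"fmt"
	"log"
	"os"
	"sync"
	"time"
)

// Logger is a thin structured wrapper over the standard library logger.
type Logger struct {
	l    *log.Logger
	name string
}

func NewLogger(name string) *Logger {
	return &Logger{l: log.New(os.Stdout, "", 0), name: name}
}

func (lg *Logger) logf(level, msg string, kv ...any) {
	line := fmt.Sprintf("ts=%s level=%s service=%s msg=%q",
		time.Now().UTC().Format(time.RFC3339), level, lg.name, msg)
	// Append key-value pairs as key=value fields
	for i := 0; i+1 < len(kv); i += 2 {
		line += fmt.Sprintf(" %v=%v", kv[i], kv[i+1])
	}
	lg.l.Println(line)
}

func (lg *Logger) Info(msg string, kv ...any)  { lg.logf("info", msg, kv...) }
func (lg *Logger) Warn(msg string, kv ...any)  { lg.logf("warn", msg, kv...) }
func (lg *Logger) Error(msg string, kv ...any) { lg.logf("error", msg, kv...) }

// Metrics is an in-memory counter registry, safe for concurrent use.
type Metrics struct {
	mu       sync.Mutex
	counters map[string]int64
}

func NewMetrics() *Metrics {
	return &Metrics{counters: make(map[string]int64)}
}

func (m *Metrics) IncrementCounter(name string) {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.counters[name]++
}

func (m *Metrics) IncrementCounterWithLabels(name string, labels map[string]string) {
	key := name
	for k, v := range labels {
		key += fmt.Sprintf(",%s=%s", k, v)
	}
	m.IncrementCounter(key)
}

func (m *Metrics) RecordLatency(name, path string, d time.Duration) {
	// Sketch only: a real implementation would feed a histogram here
	m.IncrementCounter(name + "," + path)
}

// Counter reads back a counter value (useful for tests and /metrics).
func (m *Metrics) Counter(name string) int64 {
	m.mu.Lock()
	defer m.mu.Unlock()
	return m.counters[name]
}

func main() {
	m := NewMetrics()
	m.IncrementCounter("orders_created")
	m.IncrementCounter("orders_created")
	fmt.Println(m.Counter("orders_created")) // 2
	NewLogger("demo").Info("request completed", "status", 200)
}
```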
Health Checks
```go
// HealthChecker manages health checks
type HealthChecker struct {
	checks map[string]HealthCheckFunc
	mu     sync.RWMutex
}

type HealthCheckFunc func(ctx context.Context) error

type HealthStatus struct {
	Status    string            `json:"status"`
	Checks    map[string]string `json:"checks"`
	Timestamp time.Time         `json:"timestamp"`
}

func NewHealthChecker() *HealthChecker {
	return &HealthChecker{
		checks: make(map[string]HealthCheckFunc),
	}
}

func (hc *HealthChecker) AddCheck(name string, check HealthCheckFunc) {
	hc.mu.Lock()
	defer hc.mu.Unlock()
	hc.checks[name] = check
}

func (hc *HealthChecker) Check(ctx context.Context) HealthStatus {
	hc.mu.RLock()
	defer hc.mu.RUnlock()

	status := HealthStatus{
		Status:    "healthy",
		Checks:    make(map[string]string),
		Timestamp: time.Now(),
	}

	// Run all checks in parallel, each with its own timeout
	var wg sync.WaitGroup
	var mu sync.Mutex

	for name, check := range hc.checks {
		wg.Add(1)
		go func(name string, check HealthCheckFunc) {
			defer wg.Done()

			checkCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
			defer cancel()

			err := check(checkCtx)

			mu.Lock()
			if err != nil {
				status.Checks[name] = fmt.Sprintf("unhealthy: %v", err)
				status.Status = "unhealthy"
			} else {
				status.Checks[name] = "healthy"
			}
			mu.Unlock()
		}(name, check)
	}

	wg.Wait()
	return status
}

func (s *Service) checkDatabase(ctx context.Context) error {
	return s.db.PingContext(ctx)
}

func (s *Service) handleHealth(w http.ResponseWriter, r *http.Request) {
	status := s.health.Check(r.Context())

	w.Header().Set("Content-Type", "application/json")
	if status.Status != "healthy" {
		w.WriteHeader(http.StatusServiceUnavailable)
	}
	json.NewEncoder(w).Encode(status)
}

func (s *Service) handleReady(w http.ResponseWriter, r *http.Request) {
	// Check if the service is ready to receive traffic
	if err := s.db.PingContext(r.Context()); err != nil {
		http.Error(w, "Not ready", http.StatusServiceUnavailable)
		return
	}
	w.WriteHeader(http.StatusOK)
	w.Write([]byte("OK"))
}
```
Business Logic with Resilience
```go
// Order represents a domain entity
type Order struct {
	ID        string    `json:"id"`
	UserID    string    `json:"user_id"`
	Items     []Item    `json:"items"`
	Total     float64   `json:"total"`
	Status    string    `json:"status"`
	CreatedAt time.Time `json:"created_at"`
}

type Item struct {
	ProductID string  `json:"product_id"`
	Quantity  int     `json:"quantity"`
	Price     float64 `json:"price"`
}

func (s *Service) handleOrders(w http.ResponseWriter, r *http.Request) {
	ctx := r.Context()

	switch r.Method {
	case http.MethodGet:
		s.listOrders(ctx, w, r)
	case http.MethodPost:
		s.createOrder(ctx, w, r)
	default:
		http.Error(w, "Method not allowed", http.StatusMethodNotAllowed)
	}
}

func (s *Service) createOrder(ctx context.Context, w http.ResponseWriter, r *http.Request) {
	// Parse request
	var req struct {
		UserID string `json:"user_id"`
		Items  []Item `json:"items"`
	}
	if err := json.NewDecoder(r.Body).Decode(&req); err != nil {
		s.writeError(w, "Invalid request body", http.StatusBadRequest)
		return
	}

	// Validate
	if req.UserID == "" || len(req.Items) == 0 {
		s.writeError(w, "User ID and items are required", http.StatusBadRequest)
		return
	}

	// Apply bulkhead
	err := s.bulkhead.Execute(ctx, func() error {
		return s.processOrder(ctx, req.UserID, req.Items)
	})
	if errors.Is(err, ErrBulkheadFull) {
		s.writeError(w, "Service busy, please retry", http.StatusServiceUnavailable)
		return
	}
	if err != nil {
		s.logger.Error("order creation failed", "error", err)
		s.writeError(w, "Order creation failed", http.StatusInternalServerError)
		return
	}

	w.WriteHeader(http.StatusCreated)
}

func (s *Service) processOrder(ctx context.Context, userID string, items []Item) error {
	// Start transaction; the deferred Rollback is a no-op after Commit
	tx, err := s.db.BeginTx(ctx, nil)
	if err != nil {
		return fmt.Errorf("begin transaction: %w", err)
	}
	defer tx.Rollback()

	// Calculate total
	var total float64
	for _, item := range items {
		total += item.Price * float64(item.Quantity)
	}

	// Create order
	orderID := generateOrderID()
	_, err = tx.ExecContext(ctx,
		`INSERT INTO orders (id, user_id, total, status, created_at)
		 VALUES ($1, $2, $3, 'pending', NOW())`,
		orderID, userID, total,
	)
	if err != nil {
		return fmt.Errorf("insert order: %w", err)
	}

	// Create order items
	for _, item := range items {
		_, err = tx.ExecContext(ctx,
			`INSERT INTO order_items (order_id, product_id, quantity, price)
			 VALUES ($1, $2, $3, $4)`,
			orderID, item.ProductID, item.Quantity, item.Price,
		)
		if err != nil {
			return fmt.Errorf("insert order item: %w", err)
		}
	}

	// Reserve inventory (with circuit breaker). Note: this holds the DB
	// transaction open across a network call; acceptable for a template,
	// but watch connection and lock hold times in production.
	err = s.circuitBreaker.Execute(func() error {
		return s.reserveInventory(ctx, items)
	})
	if err != nil {
		return fmt.Errorf("reserve inventory: %w", err)
	}

	// Commit transaction
	if err := tx.Commit(); err != nil {
		return fmt.Errorf("commit transaction: %w", err)
	}

	// Send async notification (fire and forget with retry)
	go s.sendOrderNotification(orderID, userID)

	s.metrics.IncrementCounter("orders_created")
	return nil
}

func (s *Service) reserveInventory(ctx context.Context, items []Item) error {
	// Call the inventory service with retry and exponential backoff
	var lastErr error
	for attempt := 0; attempt < 3; attempt++ {
		req, err := http.NewRequestWithContext(ctx, "POST", inventoryURL(), nil)
		if err != nil {
			return err
		}

		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			lastErr = err
			time.Sleep(100 * time.Millisecond * time.Duration(1<<attempt))
			continue
		}
		// Close inside the loop; a defer here would leak bodies until return
		resp.Body.Close()

		if resp.StatusCode == http.StatusOK {
			return nil
		}
		lastErr = fmt.Errorf("inventory service returned %d", resp.StatusCode)
	}
	return lastErr
}

// inventoryURL lets tests point the service at a mock inventory service
func inventoryURL() string {
	if url := os.Getenv("INVENTORY_SERVICE_URL"); url != "" {
		return url + "/reserve"
	}
	return "http://inventory-service/reserve"
}

func (s *Service) sendOrderNotification(orderID, userID string) {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Retry with backoff
	for attempt := 0; attempt < 5; attempt++ {
		err := s.sendNotification(ctx, orderID, userID)
		if err == nil {
			return
		}
		s.logger.Warn("notification failed, retrying",
			"order_id", orderID,
			"attempt", attempt,
			"error", err,
		)
		time.Sleep(time.Second * time.Duration(1<<attempt))
	}

	s.logger.Error("notification permanently failed", "order_id", orderID)
	s.metrics.IncrementCounter("notification_failures")
}

func (s *Service) sendNotification(ctx context.Context, orderID, userID string) error {
	// Implementation
	return nil
}

func (s *Service) writeError(w http.ResponseWriter, message string, code int) {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(code)
	json.NewEncoder(w).Encode(map[string]string{
		"error": message,
	})
}
```
Configuration Management
```go
// LoadConfig loads configuration from the environment
func LoadConfig() (*Config, error) {
	cfg := &Config{}

	// Could also use reflection or a library like envconfig
	cfg.ServerPort = getEnvInt("SERVER_PORT", 8080)
	cfg.ServerReadTimeout = getEnvDuration("SERVER_READ_TIMEOUT", 5*time.Second)
	cfg.ServerWriteTimeout = getEnvDuration("SERVER_WRITE_TIMEOUT", 10*time.Second)
	cfg.ShutdownTimeout = getEnvDuration("SHUTDOWN_TIMEOUT", 30*time.Second)

	cfg.DatabaseURL = os.Getenv("DATABASE_URL")
	if cfg.DatabaseURL == "" {
		return nil, errors.New("DATABASE_URL is required")
	}
	cfg.DatabaseMaxConns = getEnvInt("DATABASE_MAX_CONNS", 25)
	cfg.DatabaseMaxIdle = getEnvInt("DATABASE_MAX_IDLE", 10)
	cfg.DatabaseConnLife = getEnvDuration("DATABASE_CONN_LIFE", 5*time.Minute)

	cfg.CircuitBreakerThreshold = getEnvInt("CB_THRESHOLD", 5)
	cfg.CircuitBreakerTimeout = getEnvDuration("CB_TIMEOUT", 30*time.Second)
	cfg.RateLimitRPS = getEnvInt("RATE_LIMIT_RPS", 100)
	cfg.BulkheadSize = getEnvInt("BULKHEAD_SIZE", 50)

	cfg.TracingEndpoint = os.Getenv("TRACING_ENDPOINT")
	cfg.TracingSampleRate = getEnvFloat("TRACING_SAMPLE_RATE", 0.1)
	cfg.MetricsPort = getEnvInt("METRICS_PORT", 9090)

	cfg.EnableNewFeature = getEnvBool("ENABLE_NEW_FEATURE", false)

	return cfg, nil
}

func getEnvInt(key string, defaultVal int) int {
	if val := os.Getenv(key); val != "" {
		if i, err := strconv.Atoi(val); err == nil {
			return i
		}
	}
	return defaultVal
}

func getEnvDuration(key string, defaultVal time.Duration) time.Duration {
	if val := os.Getenv(key); val != "" {
		if d, err := time.ParseDuration(val); err == nil {
			return d
		}
	}
	return defaultVal
}

func getEnvFloat(key string, defaultVal float64) float64 {
	if val := os.Getenv(key); val != "" {
		if f, err := strconv.ParseFloat(val, 64); err == nil {
			return f
		}
	}
	return defaultVal
}

func getEnvBool(key string, defaultVal bool) bool {
	if val := os.Getenv(key); val != "" {
		if b, err := strconv.ParseBool(val); err == nil {
			return b
		}
	}
	return defaultVal
}
```
Testing
```go
import (
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"testing"
	"time"

	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
)

// Integration tests
func TestService_Integration(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping integration test")
	}

	// Set up a test database
	db := setupTestDB(t)
	defer db.Close()

	// Create the service
	cfg := &Config{
		ServerPort:  8081,
		DatabaseURL: testDatabaseURL,
		// ... other config
	}
	svc, err := NewService(cfg)
	require.NoError(t, err)

	// Start the service; Shutdown here is a test helper that triggers
	// the same path as waitForShutdown without sending a signal
	go svc.Start()
	defer svc.Shutdown()

	// Wait for the service to be ready
	waitForReady(t, "http://localhost:8081/ready")

	t.Run("create order", func(t *testing.T) {
		body := `{"user_id": "user-1", "items": [{"product_id": "prod-1", "quantity": 2, "price": 10.00}]}`
		resp, err := http.Post("http://localhost:8081/api/v1/orders",
			"application/json", strings.NewReader(body))
		require.NoError(t, err)
		defer resp.Body.Close()
		assert.Equal(t, http.StatusCreated, resp.StatusCode)
	})

	t.Run("health check", func(t *testing.T) {
		resp, err := http.Get("http://localhost:8081/health")
		require.NoError(t, err)
		defer resp.Body.Close()
		assert.Equal(t, http.StatusOK, resp.StatusCode)

		var status HealthStatus
		require.NoError(t, json.NewDecoder(resp.Body).Decode(&status))
		assert.Equal(t, "healthy", status.Status)
	})
}

func waitForReady(t *testing.T, url string) {
	client := &http.Client{Timeout: time.Second}
	for i := 0; i < 30; i++ {
		resp, err := client.Get(url)
		if err == nil {
			ready := resp.StatusCode == http.StatusOK
			resp.Body.Close()
			if ready {
				return
			}
		}
		time.Sleep(100 * time.Millisecond)
	}
	t.Fatal("service not ready")
}

// Load tests
func BenchmarkCreateOrder(b *testing.B) {
	// Setup
	svc := setupBenchmarkService(b)
	defer svc.Shutdown()

	body := `{"user_id": "user-1", "items": [{"product_id": "prod-1", "quantity": 2, "price": 10.00}]}`

	b.ResetTimer()
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			resp, err := http.Post("http://localhost:8081/api/v1/orders",
				"application/json", strings.NewReader(body))
			if err != nil {
				b.Error(err)
				continue
			}
			resp.Body.Close()
		}
	})
}

// Chaos testing
func TestService_ChaosCircuitBreaker(t *testing.T) {
	// Mock inventory service that responds slowly, then fails
	mockInventory := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		time.Sleep(100 * time.Millisecond) // Slow response
		http.Error(w, "Service unavailable", http.StatusServiceUnavailable)
	}))
	defer mockInventory.Close()

	// Point the service at the mock
	os.Setenv("INVENTORY_SERVICE_URL", mockInventory.URL)
	svc := setupTestService(t)
	defer svc.Shutdown()

	// Make requests until the circuit opens
	body := `{"user_id": "user-1", "items": [{"product_id": "prod-1", "quantity": 1, "price": 10.00}]}`
	for i := 0; i < 10; i++ {
		resp, err := http.Post("http://localhost:8081/api/v1/orders",
			"application/json", strings.NewReader(body))
		if err == nil {
			resp.Body.Close() // guard against nil resp on transport errors
		}
	}

	// Verify the circuit is open
	stats := svc.circuitBreaker.Stats()
	assert.Equal(t, "open", stats.State)
}
```
Main Entry Point
```go
func main() {
	// Load configuration
	cfg, err := LoadConfig()
	if err != nil {
		log.Fatalf("Failed to load config: %v", err)
	}

	// Create the service
	svc, err := NewService(cfg)
	if err != nil {
		log.Fatalf("Failed to create service: %v", err)
	}

	// Start the service (blocks until a shutdown signal arrives)
	if err := svc.Start(); err != nil {
		log.Fatalf("Service failed: %v", err)
	}
}
```
Production Deployment Checklist
Before Deployment
- All tests pass (unit, integration, load)
- Configuration validated for target environment
- Database migrations applied
- Secrets securely stored
- Resource limits configured
- Health check endpoints working
Observability
- Logging configured and shipping to aggregator
- Metrics exposed and scraped
- Tracing enabled with appropriate sampling
- Alerts configured for key metrics
- Dashboards created
Resilience
- Circuit breakers configured for all external calls
- Rate limiting in place
- Bulkheads for resource isolation
- Retry policies with backoff
- Timeouts on all network calls
- Graceful shutdown implemented
Security
- TLS/mTLS enabled
- Authentication/authorization in place
- Input validation
- Secrets not logged
- Security headers configured
Operations
- Runbook documented
- On-call rotation established
- Rollback procedure tested
- Scaling policies configured
- Backup/restore tested
Course Summary
Over these 30 parts, we've covered:
- Foundations: Distributed systems basics, CAP theorem, consistency models
- Communication: RPC, message queues, event streaming
- Data: Replication, partitioning, distributed databases
- Coordination: Consensus, leader election, distributed locks
- Resilience: Circuit breakers, retries, bulkheads, rate limiting
- Observability: Logging, metrics, distributed tracing
- Architecture: Microservices, service mesh, load balancing
- Production: Putting it all together
The key insight: distributed systems are about managing complexity and embracing failure as normal. Build systems that are observable, resilient, and operable.
"The difference between a working system and a production-ready system is everything that happens when things go wrong. Plan for failure, design for recovery, and always be able to see what's happening."
Thank You!
Thank you for joining this distributed systems journey. The field continues to evolve, but the fundamentals we've covered will serve you well. Keep learning, keep building, and remember: every outage is a learning opportunity.
Happy building!