System Design Part 2: Distributed Systems Deep Dive

A Complete Guide for Senior Engineers

7. Design a Distributed Rate Limiter in Go

The Challenge

Rate limiting at scale requires:

Distributed state (limits across multiple servers)
Low latency (< 1ms decision time)
Accuracy (prevent gaming by smart attackers)
Fairness (don't punish legitimate users)

Rate Limiting Algorithms

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    RATE LIMITING ALGORITHMS                                     │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   1. TOKEN BUCKET                                                               │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   ┌─────────────────┐                                                   │  │
│   │   │     BUCKET      │  Capacity: 100 tokens                             │  │
│   │   │   ○ ○ ○ ○ ○     │  Refill rate: 10 tokens/sec                       │  │
│   │   │   ○ ○ ○ ○ ○     │                                                   │  │
│   │   │   ○ ○ ○         │  Current: 13 tokens                               │  │
│   │   └────────┬────────┘                                                   │  │
│   │            │                                                             │  │
│   │            ▼                                                             │  │
│   │   Request arrives → Take 1 token → Allow                                │  │
│   │   No tokens? → Reject (429 Too Many Requests)                           │  │
│   │                                                                          │  │
│   │   PROS: Allows burst, smooth rate                                       │  │
│   │   CONS: Needs atomic counter                                            │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   2. SLIDING WINDOW LOG                                                         │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Time ──────────────────────────────────────────────────▶              │  │
│   │   │                                                       │              │  │
│   │   │  ×  ×    ×   ×  ×   ×     ×  ×                       │              │  │
│   │   │                                                       │              │  │
│   │   └─────────────────── 1 minute window ───────────────────┘              │  │
│   │                                                                          │  │
│   │   Count requests in last minute: 8                                      │  │
│   │   Limit: 10/min → Allow                                                  │  │
│   │                                                                          │  │
│   │   PROS: Accurate, no boundary issues                                    │  │
│   │   CONS: Memory intensive (stores all timestamps)                        │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   3. SLIDING WINDOW COUNTER (Best for distributed)                             │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Previous window: 8 requests (40% weight)                              │  │
│   │   Current window:  4 requests (60% elapsed, full weight)                │  │
│   │                                                                          │  │
│   │   Weighted count = 8 × 0.4 + 4 = 7.2                                    │  │
│   │   Limit: 10/min → Allow                                                  │  │
│   │                                                                          │  │
│   │   PROS: Memory efficient, smooth rate                                   │  │
│   │   CONS: Approximation (not exact)                                       │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

Distributed Rate Limiter Architecture

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    DISTRIBUTED RATE LIMITER                                     │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                         API SERVERS                                      │  │
│   │                                                                          │  │
│   │   ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐      │  │
│   │   │ Server1 │  │ Server2 │  │ Server3 │  │ Server4 │  │ Server5 │      │  │
│   │   │         │  │         │  │         │  │         │  │         │      │  │
│   │   │ [Local  │  │ [Local  │  │ [Local  │  │ [Local  │  │ [Local  │      │  │
│   │   │  Cache] │  │  Cache] │  │  Cache] │  │  Cache] │  │  Cache] │      │  │
│   │   └────┬────┘  └────┬────┘  └────┬────┘  └────┬────┘  └────┬────┘      │  │
│   │        │            │            │            │            │            │  │
│   └────────┼────────────┼────────────┼────────────┼────────────┼────────────┘  │
│            │            │            │            │            │                │
│            └────────────┴────────────┼────────────┴────────────┘                │
│                                      │                                          │
│                                      ▼                                          │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                         REDIS CLUSTER                                    │  │
│   │                                                                          │  │
│   │   Key: rate_limit:{user_id}:{window}                                    │  │
│   │   Value: request_count                                                   │  │
│   │   TTL: window_size                                                       │  │
│   │                                                                          │  │
│   │   ┌─────────────────────────────────────────────────────────────────┐   │  │
│   │   │  rate_limit:user123:1708300800 = 45  (TTL: 60s)                 │   │  │
│   │   │  rate_limit:user456:1708300800 = 12  (TTL: 60s)                 │   │  │
│   │   │  rate_limit:user789:1708300800 = 98  (TTL: 60s)                 │   │  │
│   │   └─────────────────────────────────────────────────────────────────┘   │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

Go Implementation

go
package ratelimiter

import (
    "context"
    "fmt"
    "time"

    "github.com/go-redis/redis/v8"
)

// RateLimiter limits each key to roughly limit requests per window. Its
// counters live in Redis, which is the shared store the API servers consult.
type RateLimiter struct {
    redis       *redis.Client // client the Allow script is executed against
    limit       int           // requests per window
    window      time.Duration // window size
    localCache  *LocalCache   // optional local cache for hot keys
}

// RateLimitResult is the decision returned by RateLimiter.Allow.
type RateLimitResult struct {
    Allowed    bool          // whether the request may proceed
    Remaining  int           // approximate requests left, as computed by the Redis script
    ResetAt    time.Time     // start of the current window plus one window length
    RetryAfter time.Duration // time until ResetAt; left zero when Allowed is true
}

// Sliding window counter implementation
func (r *RateLimiter) Allow(ctx context.Context, key string) (*RateLimitResult, error) {
    now := time.Now()
    currentWindow := now.Truncate(r.window).Unix()
    previousWindow := currentWindow - int64(r.window.Seconds())

    // Keys for current and previous windows
    currentKey := fmt.Sprintf("rate:%s:%d", key, currentWindow)
    previousKey := fmt.Sprintf("rate:%s:%d", key, previousWindow)

    // Lua script for atomic sliding window calculation
    script := redis.NewScript(`
        local current_key = KEYS[1]
        local previous_key = KEYS[2]
        local limit = tonumber(ARGV[1])
        local window = tonumber(ARGV[2])
        local now = tonumber(ARGV[3])
        local window_start = tonumber(ARGV[4])

        -- Get counts
        local current_count = tonumber(redis.call('GET', current_key) or '0')
        local previous_count = tonumber(redis.call('GET', previous_key) or '0')

        -- Calculate weight (how far into current window)
        local elapsed = now - window_start
        local weight = elapsed / window

        -- Weighted count (sliding window approximation)
        local weighted_count = previous_count * (1 - weight) + current_count

        if weighted_count >= limit then
            -- Rate limited
            return {0, math.ceil(limit - weighted_count), 0}
        end

        -- Increment current window
        redis.call('INCR', current_key)
        redis.call('EXPIRE', current_key, window * 2)

        local remaining = math.floor(limit - weighted_count - 1)
        return {1, remaining, 1}
    `)

    result, err := script.Run(ctx, r.redis, []string{currentKey, previousKey},
        r.limit,
        int64(r.window.Seconds()),
        now.Unix(),
        currentWindow,
    ).Slice()

    if err != nil {
        // Redis error - fail open (allow request)
        return &RateLimitResult{Allowed: true, Remaining: 0}, nil
    }

    allowed := result[0].(int64) == 1
    remaining := int(result[1].(int64))

    resetAt := time.Unix(currentWindow, 0).Add(r.window)
    var retryAfter time.Duration
    if !allowed {
        retryAfter = time.Until(resetAt)
    }

    return &RateLimitResult{
        Allowed:    allowed,
        Remaining:  remaining,
        ResetAt:    resetAt,
        RetryAfter: retryAfter,
    }, nil
}

// Token bucket implementation (alternative)
//
// TokenBucketLimiter keeps one bucket per key in a Redis hash (fields
// "tokens" and "last_refill"), refilled lazily whenever Allow is called.
type TokenBucketLimiter struct {
    redis      *redis.Client
    capacity   int           // max tokens
    refillRate int           // tokens per second
}

func (t *TokenBucketLimiter) Allow(ctx context.Context, key string) (bool, error) {
    script := redis.NewScript(`
        local key = KEYS[1]
        local capacity = tonumber(ARGV[1])
        local refill_rate = tonumber(ARGV[2])
        local now = tonumber(ARGV[3])
        local requested = tonumber(ARGV[4])

        local bucket = redis.call('HMGET', key, 'tokens', 'last_refill')
        local tokens = tonumber(bucket[1]) or capacity
        local last_refill = tonumber(bucket[2]) or now

        -- Calculate tokens to add
        local elapsed = now - last_refill
        local refill = elapsed * refill_rate
        tokens = math.min(capacity, tokens + refill)

        if tokens >= requested then
            tokens = tokens - requested
            redis.call('HMSET', key, 'tokens', tokens, 'last_refill', now)
            redis.call('EXPIRE', key, capacity / refill_rate * 2)
            return 1
        end

        return 0
    `)

    result, err := script.Run(ctx, t.redis, []string{"bucket:" + key},
        t.capacity,
        t.refillRate,
        time.Now().Unix(),
        1, // 1 token requested
    ).Int()

    if err != nil {
        return true, nil // Fail open
    }

    return result == 1, nil
}

Handling Scale

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    SCALING RATE LIMITER                                         │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   CHALLENGE: 1 million requests/second, each needs rate limit check            │
│                                                                                 │
│   SOLUTION 1: LOCAL CACHE + ASYNC SYNC                                          │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Server maintains local counter:                                        │  │
│   │   - Increment locally for each request                                  │  │
│   │   - Sync to Redis every 100ms                                           │  │
│   │   - Pull global count every 100ms                                       │  │
│   │                                                                          │  │
│   │   Trade-off: Slightly inaccurate (up to 10% over limit possible)        │  │
│   │   Benefit: 10x fewer Redis calls                                         │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   SOLUTION 2: SHARDED RATE LIMITING                                            │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Limit: 1000 req/min per user                                          │  │
│   │   Servers: 10                                                            │  │
│   │   Per-server limit: 100 req/min                                         │  │
│   │                                                                          │  │
│   │   Server 1: Can allow 100 local requests before Redis check             │  │
│   │   After 100: Must check global counter                                  │  │
│   │                                                                          │  │
│   │   Trade-off: Uneven distribution can cause early blocking               │  │
│   │   Benefit: Mostly local operations                                       │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   SOLUTION 3: CONSISTENT HASHING                                               │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Route user to same server (sticky session)                            │  │
│   │   Server owns that user's rate limit                                    │  │
│   │   No distributed coordination needed!                                    │  │
│   │                                                                          │  │
│   │   Trade-off: Server failure requires re-routing                         │  │
│   │   Benefit: Zero Redis calls for rate limiting                           │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

go
// HybridRateLimiter implements the "local cache with async sync" strategy:
// requests are counted in process memory and reconciled with a shared Redis
// counter in the background (see syncWithRedis), so Allow itself does not
// wait on Redis.
type HybridRateLimiter struct {
    redis      *redis.Client
    localCache sync.Map // key -> *localCounter
    limit      int // highest estimated global count that Allow still admits
    window     time.Duration // length of the limiting window
    syncPeriod time.Duration // how stale the last sync may get before Allow starts another
}

// localCounter is the per-key state stored in HybridRateLimiter.localCache.
//
// count stays the first field so it remains 64-bit aligned for sync/atomic on
// 32-bit platforms.
type localCounter struct {
    count int64 // requests seen locally since the last flush; sync/atomic only

    mu          sync.Mutex // guards syncedAt and globalCount
    syncedAt    time.Time  // when a sync with Redis was last attempted
    globalCount int64      // Redis total observed at the last successful sync

    syncing int32 // 1 while a background sync is in flight; sync/atomic only
}

// snapshot returns the last known global count and the time of the last sync
// attempt, read together under the lock.
func (c *localCounter) snapshot() (globalCount int64, syncedAt time.Time) {
    c.mu.Lock()
    defer c.mu.Unlock()
    return c.globalCount, c.syncedAt
}

// Allow records one request for key and reports whether the estimated global
// count (last total seen in Redis plus requests counted locally since then)
// is still within the limit.
//
// The decision never waits on Redis. When the last sync is older than
// syncPeriod, a single background sync is started; the in-flight flag keeps
// concurrent callers from each launching their own.
func (h *HybridRateLimiter) Allow(ctx context.Context, key string) bool {
    // Load first so the common hit path does not allocate a throwaway counter.
    val, ok := h.localCache.Load(key)
    if !ok {
        val, _ = h.localCache.LoadOrStore(key, &localCounter{})
    }
    counter := val.(*localCounter)

    // Increment local
    newCount := atomic.AddInt64(&counter.count, 1)

    globalCount, syncedAt := counter.snapshot()

    // Check if we need to sync
    if time.Since(syncedAt) > h.syncPeriod && atomic.CompareAndSwapInt32(&counter.syncing, 0, 1) {
        // NOTE(review): ctx is the caller's context; if it is cancelled before
        // the sync completes, the flush fails and is retried on a later call.
        go func() {
            defer atomic.StoreInt32(&counter.syncing, 0)
            h.syncWithRedis(ctx, key, counter)
        }()
    }

    // Estimate global count
    return globalCount+newCount <= int64(h.limit)
}

// syncWithRedis flushes the locally accumulated count for key into the shared
// Redis counter and stores the resulting global total on counter.
//
// On a Redis error the flushed count is added back to the local counter and
// the previous global total is kept, so no requests are forgotten; the next
// attempt happens after another syncPeriod.
func (h *HybridRateLimiter) syncWithRedis(ctx context.Context, key string, counter *localCounter) {
    // Add local count to Redis
    localCount := atomic.SwapInt64(&counter.count, 0)
    redisKey := "rate:" + key

    globalCount, err := h.redis.IncrBy(ctx, redisKey, localCount).Result()
    if err != nil {
        atomic.AddInt64(&counter.count, localCount)

        counter.mu.Lock()
        counter.syncedAt = time.Now()
        counter.mu.Unlock()
        return
    }

    // A total equal to our own contribution means this INCRBY created the key
    // (or it held 0): start the window now so the counter resets after
    // h.window instead of growing forever. Best effort: a failed EXPIRE is not
    // retried.
    if globalCount == localCount && h.window > 0 {
        h.redis.Expire(ctx, redisKey, h.window)
    }

    counter.mu.Lock()
    counter.globalCount = globalCount
    counter.syncedAt = time.Now()
    counter.mu.Unlock()
}

8. Design a Notification System (Push + Email + SMS)

The Challenge

A notification system must:

Handle millions of notifications per hour
Support multiple channels (push, email, SMS)
Ensure delivery (retries, fallbacks)
Respect user preferences (opt-out, quiet hours)
Be cost-efficient (batch where possible)

Architecture

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    NOTIFICATION SYSTEM ARCHITECTURE                             │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                         NOTIFICATION SOURCES                             │  │
│   │                                                                          │  │
│   │   ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐      │  │
│   │   │ Order   │  │ Payment │  │ Delivery│  │ Promo   │  │ System  │      │  │
│   │   │ Service │  │ Service │  │ Service │  │ Engine  │  │ Alerts  │      │  │
│   │   └────┬────┘  └────┬────┘  └────┬────┘  └────┬────┘  └────┬────┘      │  │
│   │        │            │            │            │            │            │  │
│   └────────┼────────────┼────────────┼────────────┼────────────┼────────────┘  │
│            │            │            │            │            │                │
│            └────────────┴────────────┼────────────┴────────────┘                │
│                                      │                                          │
│                                      ▼                                          │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                    NOTIFICATION SERVICE                                  │  │
│   │                                                                          │  │
│   │   ┌─────────────────────────────────────────────────────────────────┐   │  │
│   │   │                    API / Event Handler                           │   │  │
│   │   │   • Accepts notification requests                               │   │  │
│   │   │   • Validates payload                                           │   │  │
│   │   │   • Enriches with user preferences                              │   │  │
│   │   └─────────────────────────────────────────────────────────────────┘   │  │
│   │                              │                                           │  │
│   │                              ▼                                           │  │
│   │   ┌─────────────────────────────────────────────────────────────────┐   │  │
│   │   │                    PREFERENCE ENGINE                             │   │  │
│   │   │   • Check user opt-outs                                         │   │  │
│   │   │   • Check quiet hours                                           │   │  │
│   │   │   • Determine channels (push, email, SMS)                       │   │  │
│   │   │   • Apply rate limits per user                                  │   │  │
│   │   └─────────────────────────────────────────────────────────────────┘   │  │
│   │                              │                                           │  │
│   │                              ▼                                           │  │
│   │   ┌─────────────────────────────────────────────────────────────────┐   │  │
│   │   │                    TEMPLATE ENGINE                               │   │  │
│   │   │   • Render notification content                                 │   │  │
│   │   │   • Localize (language, currency)                               │   │  │
│   │   │   • Personalize (name, order details)                           │   │  │
│   │   └─────────────────────────────────────────────────────────────────┘   │  │
│   │                              │                                           │  │
│   │                              ▼                                           │  │
│   │   ┌─────────────────────────────────────────────────────────────────┐   │  │
│   │   │                    KAFKA TOPICS                                  │   │  │
│   │   │                                                                  │   │  │
│   │   │   ┌─────────────┐  ┌─────────────┐  ┌─────────────┐            │   │  │
│   │   │   │notifications│  │notifications│  │notifications│            │   │  │
│   │   │   │   .push     │  │   .email    │  │    .sms     │            │   │  │
│   │   │   └─────────────┘  └─────────────┘  └─────────────┘            │   │  │
│   │   │                                                                  │   │  │
│   │   └─────────────────────────────────────────────────────────────────┘   │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                      │                                          │
│          ┌───────────────────────────┼───────────────────────────┐             │
│          │                           │                           │             │
│          ▼                           ▼                           ▼             │
│   ┌────────────────┐      ┌────────────────┐      ┌────────────────┐          │
│   │  PUSH WORKER   │      │  EMAIL WORKER  │      │   SMS WORKER   │          │
│   │                │      │                │      │                │          │
│   │ • FCM (Android)│      │ • SES (bulk)   │      │ • Twilio       │          │
│   │ • APNS (iOS)   │      │ • SendGrid     │      │ • MSG91        │          │
│   │ • Web Push     │      │ • Mailgun      │      │ • Kaleyra      │          │
│   │                │      │                │      │                │          │
│   │ Rate: 100K/sec │      │ Rate: 10K/sec  │      │ Rate: 1K/sec   │          │
│   └────────────────┘      └────────────────┘      └────────────────┘          │
│          │                           │                           │             │
│          └───────────────────────────┼───────────────────────────┘             │
│                                      │                                          │
│                                      ▼                                          │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                    DELIVERY TRACKING                                     │  │
│   │                                                                          │  │
│   │   • Track delivery status (sent, delivered, failed)                     │  │
│   │   • Store in TimescaleDB for analytics                                  │  │
│   │   • Retry failed notifications                                           │  │
│   │   • Dead letter queue for permanent failures                            │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

Notification Flow

go
// NotificationService turns a NotificationRequest into one queued message per
// delivery channel: it consults user preferences, applies per-channel rate
// limits, renders the template and publishes to Kafka (see Send).
type NotificationService struct {
    preferenceStore PreferenceStore // source of per-user preferences
    templateEngine  TemplateEngine // looks up the template for a (type, channel) pair
    kafkaProducer   *kafka.Producer // publishes to the "notifications.<channel>" topics
    rateLimiter     RateLimiter // Send treats the result of its Allow call as a bool
}

// NotificationRequest is the input to NotificationService.Send.
type NotificationRequest struct {
    UserID       string // recipient; also used as the Kafka message key
    Type         string // ORDER_CONFIRMED, PAYMENT_RECEIVED, etc.
    Channels     []string // PUSH, EMAIL, SMS (optional, auto-detect if empty)
    Data         map[string]interface{} // values handed to the template's Render
    Priority     string // HIGH, NORMAL, LOW
    ScheduledAt  *time.Time // Send overwrites this during quiet hours unless Priority is HIGH
}

// Send queues req for delivery on every channel that is permitted.
//
// Steps, in order: load the user's preferences; return nil without sending if
// the user opted out of req.Type; during quiet hours, reschedule non-HIGH
// requests by setting req.ScheduledAt to the next active time; pick channels
// (req.Channels, or determineChannels when empty); drop channels that the
// rate limiter rejects; render one message per remaining channel and publish
// it to "notifications.<channel>" keyed by the user ID.
//
// It returns an error only when preferences cannot be loaded.
func (s *NotificationService) Send(ctx context.Context, req *NotificationRequest) error {
    // 1. Get user preferences
    prefs, err := s.preferenceStore.GetPreferences(ctx, req.UserID)
    if err != nil {
        return err
    }

    // 2. Check opt-out
    if prefs.OptedOut(req.Type) {
        return nil // Silently skip
    }

    // 3. Check quiet hours
    if prefs.IsQuietHours() && req.Priority != "HIGH" {
        // Schedule for later
        req.ScheduledAt = prefs.NextActiveTime()
    }

    // 4. Determine channels
    channels := req.Channels
    if len(channels) == 0 {
        channels = s.determineChannels(req.Type, prefs)
    }

    // 5. Apply rate limiting
    // Collect the permitted channels into a fresh slice. Removing elements
    // from the slice being ranged over can skip or re-check channels, and
    // filtering in place would also rewrite the caller's req.Channels.
    allowed := make([]string, 0, len(channels))
    for _, channel := range channels {
        if s.rateLimiter.Allow(ctx, req.UserID+":"+channel) {
            allowed = append(allowed, channel)
        }
    }

    // 6. Render templates and queue
    for _, channel := range allowed {
        template := s.templateEngine.GetTemplate(req.Type, channel)
        content := template.Render(req.Data)

        message := &NotificationMessage{
            ID:          uuid.New().String(),
            UserID:      req.UserID,
            Channel:     channel,
            Content:     content,
            Priority:    req.Priority,
            ScheduledAt: req.ScheduledAt,
            CreatedAt:   time.Now(),
        }

        topic := "notifications." + strings.ToLower(channel)
        // NOTE(review): any result of Produce is discarded here; confirm
        // whether it reports failures that should be surfaced to the caller.
        s.kafkaProducer.Produce(&kafka.Message{
            Topic: topic,
            Key:   []byte(req.UserID),
            Value: message.Marshal(),
        })
    }

    return nil
}

// Channel-specific workers

// PushWorker delivers push notifications to a user's registered devices,
// choosing the provider by each device token's platform (see Process).
type PushWorker struct {
    fcmClient  *fcm.Client
    apnsClient *apns.Client
    db         *sql.DB
}

// Process sends msg to every device token registered for msg.UserID.
//
// Each token is dispatched by platform ("android", "ios" or "web"). A
// successful send is recorded as "DELIVERED"; a failed send is remembered,
// and the token is removed when the error marks it as invalid. A token with
// any other platform is reported as an error rather than being recorded as
// delivered without anything having been sent.
//
// It returns the token-lookup error, or the last per-token error, or nil when
// every send succeeded.
func (w *PushWorker) Process(ctx context.Context, msg *NotificationMessage) error {
    // Get user's device tokens
    tokens, err := w.getDeviceTokens(ctx, msg.UserID)
    if err != nil {
        return err
    }

    var lastErr error
    for _, token := range tokens {
        var err error
        switch token.Platform {
        case "android":
            err = w.sendFCM(ctx, token.Token, msg)
        case "ios":
            err = w.sendAPNS(ctx, token.Token, msg)
        case "web":
            err = w.sendWebPush(ctx, token.Token, msg)
        default:
            // Nothing was sent, so this must not fall through to the
            // "DELIVERED" branch below.
            err = fmt.Errorf("unsupported push platform %q", token.Platform)
        }

        if err != nil {
            lastErr = err
            // Check if token is invalid
            if isInvalidToken(err) {
                w.removeToken(ctx, token)
            }
        } else {
            w.recordDelivery(ctx, msg.ID, token.Platform, "DELIVERED")
        }
    }

    return lastErr
}

// EmailWorker delivers email notifications: HIGH-priority messages are sent
// one at a time, everything else is handed to the batcher (see Process).
type EmailWorker struct {
    ses *ses.Client // client used by sendBatch for bulk sends
    batcher *Batcher // accumulates non-HIGH messages
}

// Process routes msg by priority: a HIGH-priority message is sent right away
// and its send result is returned; any other message is added to the batcher
// and nil is returned.
func (w *EmailWorker) Process(ctx context.Context, msg *NotificationMessage) error {
    if msg.Priority == "HIGH" {
        // Urgent mail bypasses batching.
        return w.sendSingle(ctx, msg)
    }

    // Everything else waits for the next batch.
    w.batcher.Add(msg)
    return nil
}

// sendBatch sends messages through the bulk email API in chunks of at most 50
// destinations per call. It stops at the first failed call and returns that
// error; chunks already sent are not rolled back. An empty input makes no
// calls and returns nil.
func (w *EmailWorker) sendBatch(ctx context.Context, messages []*NotificationMessage) error {
    // SES supports up to 50 destinations per API call
    const maxPerCall = 50

    for start := 0; start < len(messages); start += maxPerCall {
        end := min(start+maxPerCall, len(messages))
        chunk := messages[start:end]

        // One bulk entry per message in this chunk.
        entries := make([]*ses.BulkEmailDestination, 0, len(chunk))
        for _, m := range chunk {
            recipient := &ses.Destination{
                ToAddresses: []*string{aws.String(m.Content.Email)},
            }
            entries = append(entries, &ses.BulkEmailDestination{
                Destination:             recipient,
                ReplacementEmailContent: m.Content.Body,
            })
        }

        input := &ses.SendBulkEmailInput{
            BulkEmailEntries: entries,
            // ... template info
        }
        if _, err := w.ses.SendBulkEmail(ctx, input); err != nil {
            return err
        }
    }

    return nil
}

Handling Scale and Reliability

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    NOTIFICATION SCALING                                         │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   CHALLENGE: 10 million notifications/hour                                      │
│                                                                                 │
│   PUSH (Fastest):                                                               │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │   FCM limit: 500,000 messages/second (essentially unlimited)            │  │
│   │   APNS limit: Depends on certificate type                               │  │
│   │                                                                          │  │
│   │   Strategy:                                                               │  │
│   │   • 20 push workers                                                      │  │
│   │   • Batch device tokens per user                                        │  │
│   │   • Use FCM topics for broadcast notifications                          │  │
│   │   • Maintain WebSocket pools for instant delivery                       │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   EMAIL (Batched):                                                               │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │   SES limit: 14 emails/second (default), 100K/day                       │  │
│   │   With production access: 200/second, unlimited                         │  │
│   │                                                                          │  │
│   │   Strategy:                                                               │  │
│   │   • Batch non-urgent emails (collect for 5 minutes)                     │  │
│   │   • Use SES bulk API (50 emails per call)                               │  │
│   │   • Multiple SES accounts for higher throughput                         │  │
│   │   • SendGrid/Mailgun as fallback                                        │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   SMS (Expensive, Rate-limited):                                                │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │   Twilio limit: 1 message/second per number (can pool numbers)         │  │
│   │   Cost: ₹0.20 - ₹0.50 per SMS                                           │  │
│   │                                                                          │  │
│   │   Strategy:                                                               │  │
│   │   • Reserve SMS for critical only (OTP, delivery, payment)              │  │
│   │   • Pool multiple phone numbers (10 numbers = 10 SMS/sec)               │  │
│   │   • Use DLT registration for templates (India requirement)              │  │
│   │   • Fallback between Twilio, MSG91, Kaleyra                            │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   RETRY STRATEGY:                                                               │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Attempt 1: Immediate                                                   │  │
│   │   Attempt 2: After 1 minute                                             │  │
│   │   Attempt 3: After 5 minutes                                            │  │
│   │   Attempt 4: After 30 minutes                                           │  │
│   │   Attempt 5: After 2 hours                                              │  │
│   │                                                                          │  │
│   │   After 5 failures:                                                      │  │
│   │   • Move to dead letter queue                                           │  │
│   │   • Alert operations                                                     │  │
│   │   • Try alternative channel if critical                                 │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

9. Design a Fraud Detection Pipeline

The Challenge

Fraud detection must be:

Real-time (decide in < 100ms)
Accurate (low false positives, catch real fraud)
Adaptive (learn new fraud patterns)
Explainable (why was this flagged?)

Architecture

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    FRAUD DETECTION PIPELINE                                     │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         TRANSACTION FLOW                                  │ │
│   │                                                                           │ │
│   │   Payment  ──▶  API  ──▶  FRAUD  ──▶  Decision  ──▶  Process             │ │
│   │   Request      Gate      ENGINE      (Allow/     Payment                 │ │
│   │                                       Block)                              │ │
│   │                                                                           │ │
│   │                  ▲                                                        │ │
│   │                  │ < 100ms SLA                                           │ │
│   │                  ▼                                                        │ │
│   └──────────────────────────────────────────────────────────────────────────┘ │
│                                                                                 │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         FRAUD ENGINE                                      │ │
│   │                                                                           │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │                    LAYER 1: RULES ENGINE                         │    │ │
│   │   │                    (Deterministic, <10ms)                        │    │ │
│   │   │                                                                  │    │ │
│   │   │   Rule 1: Block if transaction > ₹50,000 from new device       │    │ │
│   │   │   Rule 2: Block if > 5 transactions in 1 minute                │    │ │
│   │   │   Rule 3: Challenge if location != usual location              │    │ │
│   │   │   Rule 4: Block if card used in multiple cities same day       │    │ │
│   │   │                                                                  │    │ │
│   │   │   Output: BLOCK / PASS_TO_ML / ALLOW                           │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                              │                                            │ │
│   │                              ▼                                            │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │                    LAYER 2: ML SCORING                           │    │ │
│   │   │                    (Probabilistic, <50ms)                        │    │ │
│   │   │                                                                  │    │ │
│   │   │   Features:                                                      │    │ │
│   │   │   • Transaction amount                                          │    │ │
│   │   │   • Time of day                                                 │    │ │
│   │   │   • Device fingerprint                                          │    │ │
│   │   │   • User transaction history                                    │    │ │
│   │   │   • Merchant category                                           │    │ │
│   │   │   • IP geolocation                                              │    │ │
│   │   │   • Typing patterns (if available)                              │    │ │
│   │   │                                                                  │    │ │
│   │   │   Model: Gradient Boosted Trees (XGBoost)                       │    │ │
│   │   │   Output: Fraud probability (0.0 - 1.0)                         │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                              │                                            │ │
│   │                              ▼                                            │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │                    LAYER 3: DECISION ENGINE                      │    │ │
│   │   │                                                                  │    │ │
│   │   │   Score < 0.3:  ALLOW                                           │    │ │
│   │   │   Score 0.3-0.7: CHALLENGE (OTP, additional verification)       │    │ │
│   │   │   Score > 0.7:  BLOCK                                           │    │ │
│   │   │                                                                  │    │ │
│   │   │   High-value transactions: Lower thresholds                     │    │ │
│   │   │   Trusted users: Higher thresholds                              │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                                                                           │ │
│   └──────────────────────────────────────────────────────────────────────────┘ │
│                                                                                 │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         ASYNC PIPELINE (Post-decision)                   │ │
│   │                                                                           │ │
│   │   Transaction ──▶ Kafka ──▶ Feature Store Update                        │ │
│   │                        ──▶ Pattern Detection                             │ │
│   │                        ──▶ Model Retraining Queue                        │ │
│   │                        ──▶ Case Management (for manual review)           │ │
│   │                                                                           │ │
│   └──────────────────────────────────────────────────────────────────────────┘ │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

Implementation

go
// FraudDetectionService runs the layered fraud check: deterministic rules
// first, then ML scoring, then a threshold-based decision (see Check).
//
// NOTE(review): extractFeatures also reads s.geoIP, s.vpnDetector and
// s.ipReputation, which are not declared on this struct as shown — confirm
// the full field list.
type FraudDetectionService struct {
    // rulesEngine is evaluated first on every Check call.
    rulesEngine   *RulesEngine
    // mlModel produces the fraud score via Predict.
    mlModel       *MLModel
    // featureStore supplies user and device history for feature extraction.
    featureStore  FeatureStore
    // NOTE(review): not referenced by the methods visible here; presumably
    // used by postProcess — confirm.
    kafkaProducer *kafka.Producer
}

// FraudCheckRequest describes a single transaction to be scored.
type FraudCheckRequest struct {
    TransactionID   string
    // UserID and DeviceID key the history lookups in the feature store and
    // the rules cache.
    UserID          string
    // Amount is compared against fixed thresholds by the rules and by
    // makeDecision; the currency unit is not stated in this file.
    Amount          decimal.Decimal
    MerchantID      string
    MerchantCategory string
    DeviceID        string
    IPAddress       string
    // Location is a pointer and can therefore be nil; code that reads
    // Location.City needs to account for that.
    Location        *Location
    // Timestamp is the transaction time used for hour/weekday features and
    // for travel-speed calculations.
    Timestamp       time.Time
}

// FraudCheckResult is the outcome returned by FraudDetectionService.Check.
type FraudCheckResult struct {
    Decision      string  // ALLOW, CHALLENGE, BLOCK
    // Score is the ML score when the model ran. Check sets it to 1.0 for a
    // rule-based block and 0.0 for the low-value shortcut.
    Score         float64
    // Reasons holds human-readable explanations (rule descriptions or the
    // output of explainScore).
    Reasons       []string
    // RulesFired holds the IDs of the rules whose condition matched.
    RulesFired    []string
    // ResponseTime is the elapsed time measured inside Check.
    ResponseTime  time.Duration
}

// Check decides whether a transaction should be allowed, challenged or
// blocked.
//
// It runs three layers in order:
//  1. The rules engine. A "BLOCK" decision is final and returned with score 1.0.
//  2. A shortcut: if the rules engine says "ALLOW" and the amount is below
//     1000, the ML model is skipped and the transaction is allowed with
//     score 0.0.
//  3. Otherwise features are extracted, the ML model scores them, and
//     makeDecision maps the score to a decision.
//
// Every outcome, including the two early exits, is handed to postProcess in a
// background goroutine so that blocked and shortcut transactions also reach
// the asynchronous pipeline. The fired rules are attached to every result.
//
// The returned error is currently always nil.
//
// NOTE(review): the RulesEngine.Evaluate shown in this file only produces
// "PASS_TO_ML" or "BLOCK", so the low-value shortcut is not reachable unless
// Evaluate starts returning "ALLOW".
func (s *FraudDetectionService) Check(ctx context.Context, req *FraudCheckRequest) (*FraudCheckResult, error) {
    start := time.Now()

    // finish stamps the response time and starts the async post-processing.
    // Routing every exit through it keeps the early returns from silently
    // skipping the post-decision pipeline.
    finish := func(result *FraudCheckResult) (*FraudCheckResult, error) {
        result.ResponseTime = time.Since(start)
        go s.postProcess(req, result)
        return result, nil
    }

    // Layer 1: Rules Engine (fast, deterministic)
    rulesResult := s.rulesEngine.Evaluate(ctx, req)

    if rulesResult.Decision == "BLOCK" {
        return finish(&FraudCheckResult{
            Decision:   "BLOCK",
            Score:      1.0,
            Reasons:    rulesResult.Reasons,
            RulesFired: rulesResult.FiredRules,
        })
    }

    if rulesResult.Decision == "ALLOW" && req.Amount.LessThan(decimal.NewFromInt(1000)) {
        // Skip ML for low-value, rules-passed transactions, but keep the
        // rule evidence so the decision stays explainable.
        return finish(&FraudCheckResult{
            Decision:   "ALLOW",
            Score:      0.0,
            Reasons:    rulesResult.Reasons,
            RulesFired: rulesResult.FiredRules,
        })
    }

    // Layer 2: ML Scoring
    features := s.extractFeatures(ctx, req)
    mlScore := s.mlModel.Predict(features)

    // Layer 3: Decision
    return finish(&FraudCheckResult{
        Decision:   s.makeDecision(req, mlScore, rulesResult),
        Score:      mlScore,
        Reasons:    s.explainScore(features, mlScore),
        RulesFired: rulesResult.FiredRules,
    })
}

// extractFeatures assembles the model input for req from the request itself,
// the user's and device's history in the feature store, and IP lookups.
//
// req.Location is a pointer and may be nil. In that case the location
// features (DistanceFromUsual, IsNewCity, VelocityKmh) are left at their zero
// values instead of dereferencing a nil pointer.
// NOTE(review): confirm the model treats zero location features as "unknown".
func (s *FraudDetectionService) extractFeatures(ctx context.Context, req *FraudCheckRequest) *Features {
    // Get historical data from feature store
    userHistory := s.featureStore.GetUserHistory(ctx, req.UserID)
    deviceHistory := s.featureStore.GetDeviceHistory(ctx, req.DeviceID)

    features := &Features{
        // Transaction features
        Amount:           req.Amount.InexactFloat64(),
        HourOfDay:        req.Timestamp.Hour(),
        DayOfWeek:        int(req.Timestamp.Weekday()),
        MerchantCategory: req.MerchantCategory,

        // User behavior features
        AvgTransactionAmount: userHistory.AvgAmount,
        TransactionCount24h:  userHistory.Count24h,
        TransactionCount1h:   userHistory.Count1h,
        DaysSinceFirstTxn:    userHistory.DaysSinceFirst,
        UniqueDevices30d:     userHistory.UniqueDevices30d,
        UniqueMerchants30d:   userHistory.UniqueMerchants30d,

        // Device features
        DeviceAge:            deviceHistory.AgeDays,
        DeviceTrustScore:     deviceHistory.TrustScore,
        TransactionsOnDevice: deviceHistory.TxnCount,

        // IP features
        IPCountry:   s.geoIP.GetCountry(req.IPAddress),
        IsVPN:       s.vpnDetector.IsVPN(req.IPAddress),
        IPRiskScore: s.ipReputation.GetScore(req.IPAddress),
    }

    // Location features: only computable when the request carries a location.
    if loc := req.Location; loc != nil {
        features.DistanceFromUsual = s.calculateDistance(loc, userHistory.UsualLocation)
        features.IsNewCity = !userHistory.KnownCities.Contains(loc.City)
        features.VelocityKmh = s.calculateVelocity(loc, userHistory.LastLocation, req.Timestamp, userHistory.LastTxnTime)
    }

    return features
}

// makeDecision maps an ML fraud score to "BLOCK", "CHALLENGE" or "ALLOW".
//
// The block/challenge thresholds depend on the transaction amount:
//
//	amount > 100000: 0.5 / 0.2
//	amount > 10000:  0.6 / 0.3
//	otherwise:       0.7 / 0.4
//
// Both thresholds are raised by 0.1 when rules.UserTrustLevel is "HIGH". A
// score at or above a threshold triggers the corresponding decision.
func (s *FraudDetectionService) makeDecision(req *FraudCheckRequest, score float64, rules *RulesResult) string {
    // Start from the most lenient tier and tighten for larger amounts.
    block, challenge := 0.7, 0.4
    if req.Amount.GreaterThan(decimal.NewFromInt(100000)) {
        // High value: stricter thresholds
        block, challenge = 0.5, 0.2
    } else if req.Amount.GreaterThan(decimal.NewFromInt(10000)) {
        block, challenge = 0.6, 0.3
    }

    // Trusted users get more headroom before being challenged or blocked.
    if rules.UserTrustLevel == "HIGH" {
        block += 0.1
        challenge += 0.1
    }

    switch {
    case score >= block:
        return "BLOCK"
    case score >= challenge:
        return "CHALLENGE"
    default:
        return "ALLOW"
    }
}

Rules Engine

go
// RulesEngine evaluates a fixed set of deterministic fraud rules against a
// request.
type RulesEngine struct {
    // rules is the rule set; Evaluate processes it in ascending Priority order.
    rules []Rule
    // cache is passed to every rule's Condition. NewRulesEngine does not set
    // it, so it must be assigned before Evaluate is called.
    cache *RulesCache
}

// Rule is a single deterministic fraud check.
type Rule struct {
    // ID is what Evaluate records in RulesResult.FiredRules when the rule matches.
    ID          string
    Name        string
    // Description is what Evaluate records in RulesResult.Reasons when the
    // rule matches.
    Description string
    // Condition reports whether the rule matches the request; it receives the
    // engine's cache for lookups.
    Condition   func(ctx context.Context, req *FraudCheckRequest, cache *RulesCache) bool
    // Evaluate only acts on "BLOCK" (it stops and blocks); for any other
    // action the rule is just recorded as fired.
    Action      string // BLOCK, CHALLENGE, FLAG
    // Priority orders evaluation: lower values are evaluated first.
    Priority    int
}

// NewRulesEngine returns an engine preloaded with the built-in rules:
//
//	R001  BLOCK      amount above 50000 on a device younger than 24 hours
//	R002  BLOCK      more than 5 transactions by the user in the last minute
//	R003  BLOCK      implied travel speed since the last transaction above 1000 km/h
//	R004  CHALLENGE  city not among the user's usual locations
//	R005  CHALLENGE  IP address flagged as a VPN
//
// Every rule carries a Description, because Evaluate reports it as the reason
// when the rule fires. Rules that need the request location do not fire when
// req.Location is nil.
//
// The returned engine has no cache set; the cache field must be assigned
// before Evaluate is called, since every condition reads from it.
func NewRulesEngine() *RulesEngine {
    return &RulesEngine{
        rules: []Rule{
            {
                ID:          "R001",
                Name:        "High Value New Device",
                Description: "Transaction above 50000 from a device less than 24 hours old",
                Condition: func(ctx context.Context, req *FraudCheckRequest, cache *RulesCache) bool {
                    deviceAge := cache.GetDeviceAge(req.DeviceID)
                    return req.Amount.GreaterThan(decimal.NewFromInt(50000)) && deviceAge < 24*time.Hour
                },
                Action:   "BLOCK",
                Priority: 1,
            },
            {
                ID:          "R002",
                Name:        "Velocity Check",
                Description: "More than 5 transactions by the user within 1 minute",
                Condition: func(ctx context.Context, req *FraudCheckRequest, cache *RulesCache) bool {
                    count1min := cache.GetTransactionCount(req.UserID, 1*time.Minute)
                    return count1min > 5
                },
                Action:   "BLOCK",
                Priority: 1,
            },
            {
                ID:          "R003",
                Name:        "Impossible Travel",
                Description: "Travel speed implied by the previous transaction exceeds 1000 km/h",
                Condition: func(ctx context.Context, req *FraudCheckRequest, cache *RulesCache) bool {
                    lastTxn := cache.GetLastTransaction(req.UserID)
                    if lastTxn == nil || req.Location == nil {
                        return false
                    }
                    distance := haversine(req.Location, lastTxn.Location)
                    // Transactions can be observed out of order, so use the
                    // absolute gap; a negative gap would otherwise yield a
                    // negative speed and never fire.
                    elapsed := req.Timestamp.Sub(lastTxn.Timestamp).Abs()
                    // Compare distance against the farthest reachable point
                    // instead of dividing, which avoids Inf/NaN when the two
                    // timestamps are equal.
                    // NOTE(review): very small gaps combined with location
                    // jitter can still exceed the limit; consider a minimum
                    // distance before blocking.
                    return distance > 1000*elapsed.Hours() // Faster than commercial flight
                },
                Action:   "BLOCK",
                Priority: 1,
            },
            {
                ID:          "R004",
                Name:        "Unusual Location",
                Description: "Transaction city is not among the user's usual locations",
                Condition: func(ctx context.Context, req *FraudCheckRequest, cache *RulesCache) bool {
                    if req.Location == nil {
                        // No location to compare; do not fire on missing data.
                        return false
                    }
                    usualLocations := cache.GetUsualLocations(req.UserID)
                    return !usualLocations.Contains(req.Location.City)
                },
                Action:   "CHALLENGE",
                Priority: 2,
            },
            {
                ID:          "R005",
                Name:        "VPN Detected",
                Description: "Request IP address is flagged as a VPN",
                Condition: func(ctx context.Context, req *FraudCheckRequest, cache *RulesCache) bool {
                    return cache.IsVPN(req.IPAddress)
                },
                Action:   "CHALLENGE",
                Priority: 3,
            },
        },
    }
}

// Evaluate runs the engine's rules against req in ascending Priority order
// and reports which ones fired.
//
// For every rule whose Condition matches, the rule ID is appended to
// FiredRules and its Description to Reasons. The first matching rule with
// Action "BLOCK" sets Decision to "BLOCK" and ends evaluation. Rules with any
// other action are only recorded. If no blocking rule fires the Decision is
// "PASS_TO_ML".
//
// Evaluate does not modify the engine: it sorts a private copy of the rules,
// so concurrent calls do not race on the shared slice. The sort is stable, so
// rules with equal priority run in their configured order.
func (e *RulesEngine) Evaluate(ctx context.Context, req *FraudCheckRequest) *RulesResult {
    result := &RulesResult{
        Decision:   "PASS_TO_ML",
        FiredRules: []string{},
        Reasons:    []string{},
    }

    // Sort a copy by priority; sorting e.rules in place would mutate state
    // shared by every in-flight evaluation.
    ordered := make([]Rule, len(e.rules))
    copy(ordered, e.rules)
    sort.SliceStable(ordered, func(i, j int) bool {
        return ordered[i].Priority < ordered[j].Priority
    })

    for _, rule := range ordered {
        if !rule.Condition(ctx, req, e.cache) {
            continue
        }

        result.FiredRules = append(result.FiredRules, rule.ID)
        result.Reasons = append(result.Reasons, rule.Description)

        if rule.Action == "BLOCK" {
            result.Decision = "BLOCK"
            return result // Stop on first BLOCK
        }
    }

    return result
}

10. Design a Transaction Reconciliation System

The Challenge

Banks must reconcile millions of transactions daily:

Match internal records with external partners (payment networks, other banks)
Identify discrepancies (missing, duplicate, amount mismatch)
Generate reports for auditing
Process by end of day (T+0 or T+1)

Architecture

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    RECONCILIATION SYSTEM ARCHITECTURE                           │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         DATA SOURCES                                      │ │
│   │                                                                           │ │
│   │   ┌─────────────┐  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐    │ │
│   │   │  Internal   │  │   NPCI      │  │   VISA      │  │ Partner     │    │ │
│   │   │  Ledger     │  │   File      │  │   File      │  │  Bank       │    │ │
│   │   │  (Real-time)│  │  (Daily)    │  │  (Daily)    │  │  (Daily)    │    │ │
│   │   └──────┬──────┘  └──────┬──────┘  └──────┬──────┘  └──────┬──────┘    │ │
│   │          │                │                │                │            │ │
│   └──────────┼────────────────┼────────────────┼────────────────┼────────────┘ │
│              │                │                │                │              │
│              ▼                ▼                ▼                ▼              │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         INGESTION LAYER                                   │ │
│   │                                                                           │ │
│   │   • Parse different file formats (CSV, XML, ISO20022)                    │ │
│   │   • Validate checksums                                                    │ │
│   │   • Normalize to common schema                                           │ │
│   │   • Load into staging tables                                              │ │
│   │                                                                           │ │
│   └──────────────────────────────────────────────────────────────────────────┘ │
│              │                                                                  │
│              ▼                                                                  │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         MATCHING ENGINE                                   │ │
│   │                                                                           │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │                    EXACT MATCH (First Pass)                      │    │ │
│   │   │                                                                  │    │ │
│   │   │   Match on: transaction_id, amount, date                        │    │ │
│   │   │   Result: ~95% matched                                           │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                              │                                            │ │
│   │                    Unmatched ▼                                            │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │                    FUZZY MATCH (Second Pass)                     │    │ │
│   │   │                                                                  │    │ │
│   │   │   Match on: amount ± tolerance, date ± 1 day, partial txn_id    │    │ │
│   │   │   Result: ~3% additional matched                                │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                              │                                            │ │
│   │                    Unmatched ▼                                            │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │                    MANUAL REVIEW QUEUE                           │    │ │
│   │   │                                                                  │    │ │
│   │   │   ~2% require manual investigation                              │    │ │
│   │   │   Assign to operations team                                      │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                                                                           │ │
│   └──────────────────────────────────────────────────────────────────────────┘ │
│              │                                                                  │
│              ▼                                                                  │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         EXCEPTION HANDLING                                │ │
│   │                                                                           │ │
│   │   EXCEPTION TYPES:                                                        │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │   MISSING_INTERNAL  : In partner file, not in our records       │    │ │
│   │   │   MISSING_EXTERNAL  : In our records, not in partner file       │    │ │
│   │   │   AMOUNT_MISMATCH   : Same txn, different amounts               │    │ │
│   │   │   DUPLICATE         : Same txn appears multiple times           │    │ │
│   │   │   STATUS_MISMATCH   : Different status (success vs failed)      │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                                                                           │ │
│   │   RESOLUTION:                                                             │ │
│   │   ┌─────────────────────────────────────────────────────────────────┐    │ │
│   │   │   Auto-resolve: Timing differences (retry tomorrow)             │    │ │
│   │   │   Manual resolve: Amount mismatches (needs investigation)       │    │ │
│   │   │   Escalate: Large value discrepancies (> ₹1 lakh)              │    │ │
│   │   └─────────────────────────────────────────────────────────────────┘    │ │
│   │                                                                           │ │
│   └──────────────────────────────────────────────────────────────────────────┘ │
│              │                                                                  │
│              ▼                                                                  │
│   ┌──────────────────────────────────────────────────────────────────────────┐ │
│   │                         REPORTING                                         │ │
│   │                                                                           │ │
│   │   • Daily reconciliation summary                                         │ │
│   │   • Exception report                                                      │ │
│   │   • Aging report (how long exceptions pending)                           │ │
│   │   • Regulatory reports                                                    │ │
│   │                                                                           │ │
│   └──────────────────────────────────────────────────────────────────────────┘ │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

Implementation

go
// ReconciliationService reconciles internal transaction records against a
// partner's records for a given date (see Reconcile).
type ReconciliationService struct {
    // NOTE(review): db, fileParser and notifier are not referenced directly by
    // the code visible here; presumably they back fetchInternalRecords /
    // storeResults, ingestExternalFile and notifyStakeholders — confirm.
    db             *sql.DB
    fileParser     FileParser
    // matchingEngine pairs internal and external records in Reconcile.
    matchingEngine *MatchingEngine
    notifier       Notifier
}

// ReconciliationJob describes one reconciliation run for a partner and date.
type ReconciliationJob struct {
    // ID is a freshly generated UUID string.
    ID            string
    PartnerID     string
    // ReconcileDate is the business date being reconciled.
    ReconcileDate time.Time
    // Status is "RUNNING" when the job is created and "COMPLETED" once
    // Reconcile finishes successfully.
    Status        string
    StartedAt     time.Time
    // CompletedAt is nil until the job completes.
    CompletedAt   *time.Time
    // Stats is filled in after matching; nil before that.
    Stats         *ReconciliationStats
}

// ReconciliationStats summarizes a reconciliation run as counts. Reconcile
// derives each value from the fetched record sets and the MatchResult.
type ReconciliationStats struct {
    // Number of internal and external records that were compared.
    TotalInternal    int
    TotalExternal    int
    // Records paired by the exact pass and by the fuzzy pass respectively.
    ExactMatched     int
    FuzzyMatched     int
    // MissingInternal counts records present only on the partner side;
    // MissingExternal counts records present only on our side.
    MissingInternal  int
    MissingExternal  int
    AmountMismatch   int
    Duplicates       int
}

// Reconcile runs one reconciliation pass for partnerID on the given date.
//
// It ingests the partner's file, fetches the internal records for the same
// partner and date, matches the two sets, stores the match result, hands the
// exceptions to handleExceptions, fills in the job statistics and finally
// notifies stakeholders.
//
// On success it returns the job with Status "COMPLETED", CompletedAt set and
// Stats populated. If ingesting, fetching or storing fails it returns a nil
// job and an error that wraps the underlying cause, so callers can still use
// errors.Is / errors.As on it.
func (s *ReconciliationService) Reconcile(ctx context.Context, partnerID string, date time.Time) (*ReconciliationJob, error) {
    job := &ReconciliationJob{
        ID:            uuid.New().String(),
        PartnerID:     partnerID,
        ReconcileDate: date,
        Status:        "RUNNING",
        StartedAt:     time.Now(),
    }

    // 1. Ingest external file
    externalRecords, err := s.ingestExternalFile(ctx, partnerID, date)
    if err != nil {
        return nil, fmt.Errorf("failed to ingest external file: %w", err)
    }

    // 2. Fetch internal records for the same date
    internalRecords, err := s.fetchInternalRecords(ctx, partnerID, date)
    if err != nil {
        return nil, fmt.Errorf("failed to fetch internal records: %w", err)
    }

    // 3. Run matching
    matchResult := s.matchingEngine.Match(ctx, internalRecords, externalRecords)

    // 4. Store results (wrapped like the other steps so the failing stage is
    // identifiable from the error message)
    if err := s.storeResults(ctx, job.ID, matchResult); err != nil {
        return nil, fmt.Errorf("failed to store results: %w", err)
    }

    // 5. Handle exceptions
    s.handleExceptions(ctx, job.ID, matchResult.Exceptions)

    // 6. Update job stats
    job.Stats = &ReconciliationStats{
        TotalInternal:   len(internalRecords),
        TotalExternal:   len(externalRecords),
        ExactMatched:    matchResult.ExactMatched,
        FuzzyMatched:    matchResult.FuzzyMatched,
        MissingInternal: len(matchResult.MissingInternal),
        MissingExternal: len(matchResult.MissingExternal),
        AmountMismatch:  len(matchResult.AmountMismatches),
        Duplicates:      len(matchResult.Duplicates),
    }

    completedAt := time.Now()
    job.CompletedAt = &completedAt
    job.Status = "COMPLETED"

    // 7. Send notifications
    s.notifyStakeholders(ctx, job)

    return job, nil
}

// MatchingEngine pairs internal transactions with a partner's transactions,
// first by reference ID and then by fuzzy criteria.
type MatchingEngine struct {
    // tolerance is the largest absolute amount difference that the fuzzy pass
    // still accepts.
    tolerance decimal.Decimal // Amount tolerance for fuzzy matching
}

// MatchResult is the outcome of MatchingEngine.Match.
type MatchResult struct {
    // ExactMatched counts pairs whose reference ID and amount both agree.
    ExactMatched      int
    FuzzyMatched      int
    MissingInternal   []*Exception // In external, not in internal
    MissingExternal   []*Exception // In internal, not in external
    // AmountMismatches holds pairs whose reference ID agrees but whose
    // amounts differ.
    AmountMismatches  []*Exception
    Duplicates        []*Exception
    // Exceptions is what Reconcile passes to handleExceptions.
    // NOTE(review): presumably the union of the typed lists above — confirm
    // where it is populated.
    Exceptions        []*Exception
}

// Match reconciles internal transactions against a partner's external
// records and returns match counts plus every exception found.
//
// External records are processed in stages:
//
//  1. Duplicate screening: the first record seen for each TransactionID is
//     kept; every later record carrying the same ID is reported in
//     Duplicates and takes no further part in matching. Without this, a
//     repeated partner row would pair with the same internal record again
//     and inflate ExactMatched / AmountMismatches.
//  2. Exact pass: an external TransactionID equal to an internal
//     ExternalRefID pairs the two records. Equal amounts count as
//     ExactMatched; unequal amounts produce an AMOUNT_MISMATCH exception.
//     Either way both records are consumed.
//  3. Fuzzy pass: each remaining external record is paired with the first
//     remaining internal record on the same TransactionDate whose amount
//     is within m.tolerance. A non-zero difference is additionally reported
//     as FUZZY_AMOUNT_DIFF.
//  4. Whatever is still unpaired is reported as MISSING_INTERNAL (external
//     only) or MISSING_EXTERNAL (internal only).
//
// ctx is accepted for interface symmetry and is not consulted. The fuzzy pass
// is O(len(internal) * len(external)) in the worst case.
func (m *MatchingEngine) Match(ctx context.Context, internal, external []*Transaction) *MatchResult {
    result := &MatchResult{}

    // Index internal records by the partner reference for O(1) lookup.
    // When two internal records share an ExternalRefID the later one wins.
    internalIndex := make(map[string]*Transaction, len(internal))
    for _, txn := range internal {
        internalIndex[txn.ExternalRefID] = txn
    }
    internalMatched := make(map[string]bool, len(internal))

    // Screen out repeated external IDs so each partner transaction is
    // matched at most once; the repeats are surfaced as duplicates.
    seenExternal := make(map[string]bool, len(external))
    uniqueExternal := make([]*Transaction, 0, len(external))
    for _, extTxn := range external {
        if seenExternal[extTxn.TransactionID] {
            result.Duplicates = append(result.Duplicates, &Exception{
                Type:           "DUPLICATE_EXTERNAL",
                ExternalTxnID:  extTxn.TransactionID,
                ExternalAmount: extTxn.Amount,
            })
            continue
        }
        seenExternal[extTxn.TransactionID] = true
        uniqueExternal = append(uniqueExternal, extTxn)
    }

    externalMatched := make(map[string]bool, len(uniqueExternal))

    // First pass: exact matching on the shared reference ID.
    for _, extTxn := range uniqueExternal {
        intTxn, exists := internalIndex[extTxn.TransactionID]
        if !exists {
            continue
        }

        // The IDs agree, so both records are consumed whether or not the
        // amounts agree; a mismatch is reported instead of retried fuzzily.
        internalMatched[intTxn.ID] = true
        externalMatched[extTxn.TransactionID] = true

        if intTxn.Amount.Equal(extTxn.Amount) {
            result.ExactMatched++
            continue
        }

        result.AmountMismatches = append(result.AmountMismatches, &Exception{
            Type:           "AMOUNT_MISMATCH",
            InternalTxnID:  intTxn.ID,
            ExternalTxnID:  extTxn.TransactionID,
            InternalAmount: intTxn.Amount,
            ExternalAmount: extTxn.Amount,
            Difference:     intTxn.Amount.Sub(extTxn.Amount),
        })
    }

    // Second pass: fuzzy matching for records the exact pass left over.
    for _, extTxn := range uniqueExternal {
        if externalMatched[extTxn.TransactionID] {
            continue
        }

        for _, intTxn := range internal {
            if internalMatched[intTxn.ID] {
                continue
            }

            // Fuzzy criteria: amount within tolerance and same date.
            amountDiff := intTxn.Amount.Sub(extTxn.Amount).Abs()
            if amountDiff.LessThanOrEqual(m.tolerance) &&
                intTxn.TransactionDate.Equal(extTxn.TransactionDate) {
                result.FuzzyMatched++
                internalMatched[intTxn.ID] = true
                externalMatched[extTxn.TransactionID] = true

                // Still record for review if amounts differ.
                if !amountDiff.IsZero() {
                    result.AmountMismatches = append(result.AmountMismatches, &Exception{
                        Type:           "FUZZY_AMOUNT_DIFF",
                        InternalTxnID:  intTxn.ID,
                        ExternalTxnID:  extTxn.TransactionID,
                        InternalAmount: intTxn.Amount,
                        ExternalAmount: extTxn.Amount,
                        Difference:     amountDiff,
                    })
                }
                break // first acceptable candidate wins
            }
        }
    }

    // Anything still unpaired is missing on one side or the other.
    for _, extTxn := range uniqueExternal {
        if !externalMatched[extTxn.TransactionID] {
            result.MissingInternal = append(result.MissingInternal, &Exception{
                Type:           "MISSING_INTERNAL",
                ExternalTxnID:  extTxn.TransactionID,
                ExternalAmount: extTxn.Amount,
            })
        }
    }

    for _, intTxn := range internal {
        if !internalMatched[intTxn.ID] {
            result.MissingExternal = append(result.MissingExternal, &Exception{
                Type:           "MISSING_EXTERNAL",
                InternalTxnID:  intTxn.ID,
                InternalAmount: intTxn.Amount,
            })
        }
    }

    // Combine all exceptions into the single list consumed downstream.
    result.Exceptions = append(result.Exceptions, result.MissingInternal...)
    result.Exceptions = append(result.Exceptions, result.MissingExternal...)
    result.Exceptions = append(result.Exceptions, result.AmountMismatches...)
    result.Exceptions = append(result.Exceptions, result.Duplicates...)

    return result
}

Handling Large Scale Reconciliation

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    SCALING RECONCILIATION                                       │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   CHALLENGE: 10 million transactions/day per partner                           │
│                                                                                 │
│   SOLUTION 1: PARTITIONED MATCHING                                              │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Split by hour:                                                         │  │
│   │   00:00-01:00 → Worker 1                                                │  │
│   │   01:00-02:00 → Worker 2                                                │  │
│   │   ...                                                                    │  │
│   │   23:00-24:00 → Worker 24                                               │  │
│   │                                                                          │  │
│   │   Each worker handles ~400K transactions                                │  │
│   │   Parallel processing: 24 workers                                       │  │
│   │   Total time: 10-15 minutes instead of hours                            │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   SOLUTION 2: DATABASE OPTIMIZATION                                            │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Index strategy:                                                        │  │
│   │   CREATE INDEX idx_recon_lookup ON transactions                         │  │
│   │       (partner_id, external_ref_id, transaction_date);                  │  │
│   │                                                                          │  │
│   │   Partition by date:                                                     │  │
│   │   Each day's data in separate partition                                 │  │
│   │   Old partitions archived/dropped                                       │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
│   SOLUTION 3: BLOOM FILTER PRE-CHECK                                           │
│   ┌─────────────────────────────────────────────────────────────────────────┐  │
│   │                                                                          │  │
│   │   Step 1: Build Bloom filter of internal transaction IDs               │  │
│   │   Step 2: Check external IDs against filter                            │  │
│   │   Step 3: Only do DB lookup for "probably exists"                       │  │
│   │                                                                          │  │
│   │   Benefit: 95% of "not found" cases avoid a DB lookup                   │  │
│   │                                                                          │  │
│   └─────────────────────────────────────────────────────────────────────────┘  │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

11. How Do You Prevent Double Spending in Distributed Systems?

This is the most critical question for any payment system.

The Problem

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    DOUBLE SPENDING SCENARIO                                     │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   User balance: ₹100                                                            │
│                                                                                 │
│   Time 0ms: Request 1 arrives at Server A                                       │
│   Time 1ms: Request 2 arrives at Server B (same payment, network retry)        │
│                                                                                 │
│   Server A:                           Server B:                                 │
│   ┌─────────────────────────────┐    ┌─────────────────────────────┐           │
│   │ 1. Check balance: ₹100 ✓   │    │ 1. Check balance: ₹100 ✓   │           │
│   │ 2. Deduct ₹80              │    │ 2. Deduct ₹80              │           │
│   │ 3. Update: ₹20             │    │ 3. Update: ₹20             │           │
│   └─────────────────────────────┘    └─────────────────────────────┘           │
│                                                                                 │
│   RESULT: User paid ₹160 but only had ₹100!                                    │
│   Money was "created" out of thin air.                                         │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

Solutions

Solution 1: Idempotency Keys

go
// ProcessPayment handles a payment request at most once per idempotency key.
//
// The key comes from the request when the client supplied one; otherwise it
// is derived from the user, amount, merchant and client transaction ID. If
// the idempotency store already holds a response for the key, that stored
// response is returned unchanged. Otherwise the payment is executed via
// processWithLock and the response is stored for 24 hours.
//
// A lookup error from the store is treated the same as "not found": the
// payment proceeds to processWithLock.
func (s *PaymentService) ProcessPayment(ctx context.Context, req *PaymentRequest) (*PaymentResponse, error) {
    // Prefer the client's key; fall back to a deterministic one.
    key := req.IdempotencyKey
    if key == "" {
        key = sha256(req.UserID + req.Amount + req.MerchantID + req.ClientTxnID)
    }

    // Replay: a previously stored response is handed back as-is.
    if cached, lookupErr := s.idempotencyStore.Get(ctx, key); lookupErr == nil && cached != nil {
        return cached, nil
    }

    // First time we see this key: run the payment under the lock.
    resp, err := s.processWithLock(ctx, key, req)
    if err != nil {
        return nil, err
    }

    // Remember the response so later duplicates get the same answer.
    s.idempotencyStore.Set(ctx, key, resp, 24*time.Hour)

    return resp, nil
}

Solution 2: Database-Level Uniqueness

sql
-- Create payment with unique constraint
-- The UNIQUE constraint on idempotency_key lets the database itself reject a
-- second row for the same key, regardless of which server sent it.
CREATE TABLE payments (
    id              UUID PRIMARY KEY,
    idempotency_key VARCHAR(64) UNIQUE,  -- Prevents duplicates
    user_id         UUID NOT NULL,
    amount          DECIMAL(15,2) NOT NULL,
    status          VARCHAR(20) NOT NULL,
    created_at      TIMESTAMP DEFAULT NOW()
);

-- Insert with conflict handling
-- Parameters: $1 = id, $2 = idempotency_key, $3 = user_id, $4 = amount.
-- New rows always start in status 'PENDING'.
INSERT INTO payments (id, idempotency_key, user_id, amount, status)
VALUES ($1, $2, $3, $4, 'PENDING')
ON CONFLICT (idempotency_key) DO NOTHING
RETURNING id;

-- If no row returned, it was a duplicate: the conflicting insert was skipped,
-- so RETURNING has nothing to report.

Solution 3: Distributed Locking

go
// processWithLock executes a payment while holding a distributed lock keyed
// on the paying user, so payments for the same user do not run concurrently.
//
// Any failure to obtain the lock is reported as ErrPaymentInProgress. Once
// the lock is held, the idempotency key is checked again; if a response
// already exists it is returned instead of executing the payment a second
// time. The lock is requested with a 30-second duration and released when
// the function returns.
func (s *PaymentService) processWithLock(ctx context.Context, idempotencyKey string, req *PaymentRequest) (*PaymentResponse, error) {
    // One lock per user account.
    userLockKey := fmt.Sprintf("payment:lock:%s", req.UserID)

    held, lockErr := s.redlock.Lock(ctx, userLockKey, 30*time.Second)
    if lockErr != nil {
        return nil, ErrPaymentInProgress
    }
    defer held.Unlock()

    // Another request may have finished this payment while we waited.
    prior := s.checkIdempotency(ctx, idempotencyKey)
    if prior != nil {
        return prior, nil
    }

    // We hold the lock and nothing was recorded for this key: execute.
    return s.executePayment(ctx, req)
}

Solution 4: Optimistic Locking with Version

sql
-- Add version column
-- Every successful balance update below increments it by one.
ALTER TABLE accounts ADD COLUMN version INTEGER DEFAULT 0;

-- Update with version check
-- Parameters: $1 = amount to deduct, $2 = account id,
--             $3 = the version the caller read before deciding to deduct.
-- The row is changed only if the balance covers the amount AND nobody else
-- has bumped the version since the caller read it.
UPDATE accounts
SET balance = balance - $1,
    version = version + 1
WHERE id = $2
  AND balance >= $1
  AND version = $3
RETURNING version;

-- If no rows updated:
-- - Either insufficient balance
-- - Or version mismatch (concurrent update)
-- Retry with fresh version

Solution 5: Event Sourcing with Deduplication

go
// AccountEvent is a single credit or debit recorded against an account.
// Instead of storing a balance, the system stores these events and derives
// the balance from them.
type AccountEvent struct {
    EventID        string
    AccountID      string
    EventType      string // CREDIT, DEBIT
    Amount         decimal.Decimal
    Timestamp      time.Time
    IdempotencyKey string // used as the conflict target when the event is inserted
}

// AppendEvent inserts event into account_events.
//
// The insert uses ON CONFLICT (idempotency_key) DO NOTHING, so an event whose
// idempotency key is already present is skipped rather than inserted again.
// Only the error from executing the statement is returned; the statement's
// result is discarded, so callers are not told whether a row was written.
// event.Timestamp is not part of the inserted columns.
func (s *EventStore) AppendEvent(ctx context.Context, event *AccountEvent) error {
    const insertEvent = `
        INSERT INTO account_events (event_id, account_id, event_type, amount, idempotency_key)
        VALUES ($1, $2, $3, $4, $5)
        ON CONFLICT (idempotency_key) DO NOTHING
    `

    _, err := s.db.ExecContext(
        ctx,
        insertEvent,
        event.EventID,
        event.AccountID,
        event.EventType,
        event.Amount,
        event.IdempotencyKey,
    )
    return err
}

// GetBalance computes the balance of accountID from its events: CREDIT
// amounts are added and every other event type is subtracted. An account
// with no events yields 0 (via COALESCE).
//
// The scanned value is returned together with any error from the query; on
// error the returned amount should not be relied upon.
func (s *EventStore) GetBalance(ctx context.Context, accountID string) (decimal.Decimal, error) {
    const sumEvents = `
        SELECT COALESCE(
            SUM(CASE WHEN event_type = 'CREDIT' THEN amount ELSE -amount END),
            0
        )
        FROM account_events
        WHERE account_id = $1
    `

    var total decimal.Decimal
    row := s.db.QueryRowContext(ctx, sumEvents, accountID)
    err := row.Scan(&total)

    return total, err
}

Comparison of Approaches

Approach	Pros	Cons	Best For
Idempotency Keys	Simple, works with any DB	Requires client coordination	APIs with retries
DB Uniqueness	Database guarantees	Single DB only	Single-region systems
Distributed Lock	Works across services	Performance overhead	Cross-service operations
Optimistic Lock	High concurrency	Retries needed	High-read, low-write
Event Sourcing	Full audit trail	Complex queries	Audit-heavy systems

12. How Would You Handle Partial Failure in Payment Processing?

The Scenario

┌─────────────────────────────────────────────────────────────────────────────────┐
│                    PARTIAL FAILURE SCENARIO                                     │
├─────────────────────────────────────────────────────────────────────────────────┤
│                                                                                 │
│   Payment flow:                                                                 │
│   1. Validate request ✓                                                         │
│   2. Reserve inventory ✓                                                        │
│   3. Charge payment ✓                                                           │
│   4. Confirm order ✗ (SERVICE DOWN!)                                           │
│   5. Notify customer (never reached)                                            │
│                                                                                 │
│   STATE:                                                                        │
│   - Customer charged ✓                                                          │
│   - Inventory reserved ✓                                                        │
│   - Order NOT confirmed                                                         │
│   - Customer NOT notified                                                       │
│                                                                                 │
│   QUESTION: What do we do now?                                                 │
│                                                                                 │
└─────────────────────────────────────────────────────────────────────────────────┘

Solution: Saga Pattern with Compensation

go
// OrderSaga is an ordered list of steps that together process an order.
type OrderSaga struct {
    // steps are executed in slice order.
    steps []SagaStep
}

// SagaStep is one unit of work in a saga together with its optional undo
// action.
type SagaStep struct {
    Name       string                                           // used in log and error messages
    Execute    func(ctx context.Context, data *OrderData) error // performs the step
    Compensate func(ctx context.Context, data *OrderData) error // undoes the step; nil when there is nothing to undo
}

// NewOrderSaga builds the order-processing saga. Its steps, in order, are:
// validate, reserve_inventory, charge_payment, confirm_order and
// notify_customer. The first and last steps have no compensation.
func NewOrderSaga() *OrderSaga {
    steps := []SagaStep{
        {
            // Nothing to compensate: Compensate stays nil.
            Name: "validate",
            Execute: func(ctx context.Context, data *OrderData) error {
                return validateOrder(ctx, data)
            },
        },
        {
            Name: "reserve_inventory",
            Execute: func(ctx context.Context, data *OrderData) error {
                return reserveInventory(ctx, data)
            },
            Compensate: func(ctx context.Context, data *OrderData) error {
                return releaseInventory(ctx, data)
            },
        },
        {
            Name: "charge_payment",
            Execute: func(ctx context.Context, data *OrderData) error {
                return chargePayment(ctx, data)
            },
            Compensate: func(ctx context.Context, data *OrderData) error {
                return refundPayment(ctx, data)
            },
        },
        {
            Name: "confirm_order",
            Execute: func(ctx context.Context, data *OrderData) error {
                return confirmOrder(ctx, data)
            },
            Compensate: func(ctx context.Context, data *OrderData) error {
                return cancelOrder(ctx, data)
            },
        },
        {
            // Optional, can skip: Compensate stays nil.
            Name: "notify_customer",
            Execute: func(ctx context.Context, data *OrderData) error {
                return notifyCustomer(ctx, data)
            },
        },
    }

    return &OrderSaga{steps: steps}
}

// Execute runs the saga's steps in order against data.
//
// If a step fails, every step that already completed is compensated in
// reverse order (steps without a Compensate function are skipped), and the
// step's error is returned wrapped with the step name. A compensation that
// itself fails is logged and escalated for manual intervention; the
// remaining compensations still run.
//
// NOTE(review): compensations receive the same ctx as the failed step. If
// that ctx is already cancelled, they are handed a cancelled context —
// confirm this is intended.
func (s *OrderSaga) Execute(ctx context.Context, data *OrderData) error {
    for idx, step := range s.steps {
        execErr := step.Execute(ctx, data)
        if execErr == nil {
            continue
        }

        log.Printf("Step %s failed: %v. Starting compensation.", step.Name, execErr)

        // Steps run strictly in order, so the completed ones are exactly
        // idx-1 down to 0; walk them newest-first.
        for done := idx - 1; done >= 0; done-- {
            undo := s.steps[done]
            if undo.Compensate == nil {
                continue
            }

            if compErr := undo.Compensate(ctx, data); compErr != nil {
                // Compensation failed - this is serious.
                log.Printf("CRITICAL: Compensation for %s failed: %v", undo.Name, compErr)
                s.escalateToManualIntervention(ctx, data, done, compErr)
            }
        }

        return fmt.Errorf("saga failed at step %s: %w", step.Name, execErr)
    }

    return nil
}

Persisted Saga for Crash Recovery

go
// SagaState is the persisted progress record of one saga run, used to resume
// the saga after a crash or restart.
type SagaState struct {
    SagaID         string
    OrderID        string
    CurrentStep    int      // index into the saga's steps at which execution resumes
    Status         string   // RUNNING, COMPENSATING, COMPLETED, FAILED
    CompletedSteps []string // names of the steps that finished successfully, in order
    FailedAt       *string  // name of the step that failed; nil if none has
    CreatedAt      time.Time
    UpdatedAt      time.Time
}

// ExecuteWithPersistence runs the saga identified by sagaID, saving its state
// around every step so that execution can be resumed after a crash.
//
// State is loaded (or created) first and execution starts at
// state.CurrentStep. For each step the state is saved as RUNNING before the
// step executes. Each step gets its own 30-second timeout derived from ctx.
// On success the step name is appended to CompletedSteps and CurrentStep is
// advanced past it in the same save, so a restart or a repeated call never
// re-executes a step that was already recorded as completed. After the last
// step CurrentStep equals the number of steps and the status is COMPLETED.
//
// On failure the state is saved as COMPENSATING with FailedAt set to the
// step's name (CurrentStep still points at the failed step), compensation is
// started, and the step's error is returned.
//
// The outcome of saveState and compensate is not inspected here.
func (s *SagaOrchestrator) ExecuteWithPersistence(ctx context.Context, sagaID string, data *OrderData) error {
    // Load or create saga state.
    state, err := s.loadOrCreateState(ctx, sagaID, data)
    if err != nil {
        return err
    }

    // Resume from where we left off.
    for i := state.CurrentStep; i < len(s.saga.steps); i++ {
        step := s.saga.steps[i]

        // Record intent before executing, so a crash mid-step is visible.
        state.CurrentStep = i
        state.Status = "RUNNING"
        s.saveState(ctx, state)

        // Execute step with timeout; release the timer as soon as it returns.
        stepCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
        stepErr := step.Execute(stepCtx, data)
        cancel()

        if stepErr != nil {
            failedAt := step.Name
            state.FailedAt = &failedAt
            state.Status = "COMPENSATING"
            s.saveState(ctx, state)

            // Start compensation.
            s.compensate(ctx, state, data)
            return stepErr
        }

        // Persist completion and the new resume point together; leaving
        // CurrentStep at i would make a resume run this step a second time.
        state.CompletedSteps = append(state.CompletedSteps, step.Name)
        state.CurrentStep = i + 1
        s.saveState(ctx, state)
    }

    state.Status = "COMPLETED"
    s.saveState(ctx, state)
    return nil
}

// RecoverIncompleteSagas is the background job that picks up incomplete
// sagas after a service restart.
//
// It selects saga states that are still RUNNING or COMPENSATING and have not
// been updated for more than five minutes, and resumes each one in its own
// goroutine. The goroutines are not waited for. If the query fails, the error
// is logged and nothing is resumed.
func (s *SagaOrchestrator) RecoverIncompleteSagas(ctx context.Context) {
    incompleteSagas, err := s.db.Query(`
        SELECT * FROM saga_states
        WHERE status IN ('RUNNING', 'COMPENSATING')
        AND updated_at < NOW() - INTERVAL '5 minutes'
    `)
    if err != nil {
        // Surface the failure instead of silently recovering nothing.
        log.Printf("saga recovery: querying incomplete sagas failed: %v", err)
        return
    }

    for _, saga := range incompleteSagas {
        go s.resumeSaga(ctx, saga)
    }
}

This completes Part 2 covering distributed systems patterns. The document continues with more topics in subsequent parts.