Go Daemon Development: Graceful SIGTERM Handling and Config Hot-Reload

For a NAS product, I needed to develop a 24/7 system daemon for disk health monitoring and scheduled task management. This post covers the key technical aspects: signal handling, graceful shutdown, and config hot-reload.

1. Why Write Your Own Daemon?

1.1 Use Case

The NAS device needs a “caretaker” process:

  • Periodic disk SMART status checks
  • RAID array health monitoring
  • Scheduled backup tasks
  • OOM protection (proactively clear cache when memory is tight)

1.2 Why Not Just Use cron?

cron can’t cover these requirements:

  • The process must stay resident in memory and maintain state
  • Tasks have dependencies on each other
  • It must react immediately to system events (like disk plug/unplug)
  • It needs custom retry logic

So we need a long-running daemon.

2. Basic Skeleton

2.1 Minimal Daemon Structure

package main

import (
    "context"
    "log"
    "os"
    "os/signal"
    "syscall"
    "time"
)

func main() {
    // Create cancellable context
    ctx, cancel := context.WithCancel(context.Background())
    
    // Start business logic
    go runDaemon(ctx)
    
    // Wait for shutdown signal
    waitForShutdown(cancel)
}

func runDaemon(ctx context.Context) {
    ticker := time.NewTicker(time.Minute)
    defer ticker.Stop()
    
    for {
        select {
        case <-ctx.Done():
            log.Println("Received shutdown signal, stopping...")
            return
        case <-ticker.C:
            doPeriodicWork()
        }
    }
}

func waitForShutdown(cancel context.CancelFunc) {
    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
    
    sig := <-sigChan
    log.Printf("Received signal %v, starting graceful shutdown...", sig)
    cancel()
    
    // Give business logic time to clean up
    time.Sleep(5 * time.Second)
    log.Println("Shutdown complete")
}

2.2 Signal Handling Deep Dive

Common signals and how to handle them:

| Signal       | Trigger                      | Recommended Action |
|--------------|------------------------------|--------------------|
| SIGINT (2)   | Ctrl+C                       | Graceful shutdown  |
| SIGTERM (15) | kill / systemd stop          | Graceful shutdown  |
| SIGKILL (9)  | kill -9                      | Can’t be caught!   |
| SIGHUP (1)   | Terminal disconnect / custom | Hot-reload config  |
| SIGUSR1 (10) | User-defined                 | Print status/dump  |
| SIGUSR2 (12) | User-defined                 | Toggle log level   |
func setupSignalHandlers(ctx context.Context, cancel context.CancelFunc, configReload chan struct{}) {
    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, 
        syscall.SIGINT, 
        syscall.SIGTERM, 
        syscall.SIGHUP,
        syscall.SIGUSR1,
    )
    
    go func() {
        for sig := range sigChan {
            switch sig {
            case syscall.SIGINT, syscall.SIGTERM:
                log.Printf("Received %v, starting graceful shutdown", sig)
                cancel()
                return
            case syscall.SIGHUP:
                log.Println("Received SIGHUP, triggering hot-reload")
                configReload <- struct{}{}
            case syscall.SIGUSR1:
                log.Println("Received SIGUSR1, printing status")
                printDaemonStatus()
            }
        }
    }()
}
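
printDaemonStatus() isn’t shown above. A minimal sketch (the runtime-counter dump is my assumption; it needs the "runtime" package imported and a package-level startTime recorded at boot):

var startTime = time.Now() // set once when the process starts

// printDaemonStatus dumps basic runtime counters to the log on SIGUSR1.
// A real daemon would add its own state here (queue sizes, last check results, ...).
func printDaemonStatus() {
    var m runtime.MemStats
    runtime.ReadMemStats(&m)
    log.Printf("status: uptime=%s goroutines=%d heap=%d KiB",
        time.Since(startTime).Round(time.Second),
        runtime.NumGoroutine(),
        m.HeapAlloc/1024)
}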

3. Graceful Shutdown

3.1 Why Is Graceful Shutdown Necessary?

If a process is killed immediately:

  • Files being written may be corrupted
  • Database connections not closed properly
  • Network requests left hanging, clients timeout
  • Temp files not cleaned up

3.2 Using Context for Cancellation Propagation

type DiskMonitor struct {
    ctx    context.Context
    cancel context.CancelFunc
    wg     sync.WaitGroup
}

func NewDiskMonitor(parentCtx context.Context) *DiskMonitor {
    ctx, cancel := context.WithCancel(parentCtx)
    return &DiskMonitor{
        ctx:    ctx,
        cancel: cancel,
    }
}

func (m *DiskMonitor) Start() {
    m.wg.Add(1)
    go func() {
        defer m.wg.Done()
        m.run()
    }()
}

func (m *DiskMonitor) run() {
    ticker := time.NewTicker(5 * time.Minute)
    defer ticker.Stop()
    
    for {
        select {
        case <-m.ctx.Done():
            log.Println("DiskMonitor: received cancel signal")
            m.cleanup()
            return
        case <-ticker.C:
            m.checkAllDisks()
        }
    }
}

func (m *DiskMonitor) cleanup() {
    log.Println("DiskMonitor: cleaning up resources...")
    // Close database connections
    // Flush cache to disk
    // etc...
}

func (m *DiskMonitor) Stop() {
    log.Println("DiskMonitor: stop requested")
    m.cancel()
    m.wg.Wait() // Wait for goroutine to actually exit
    log.Println("DiskMonitor: fully stopped")
}
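
Usage from the daemon’s main goroutine then comes down to Start at boot and a blocking Stop during shutdown. A sketch (runMonitors is just an illustrative wrapper, not part of the code above):

// Example wiring: start the monitor at boot, stop it when the root context is cancelled.
func runMonitors(ctx context.Context) {
    monitor := NewDiskMonitor(ctx)
    monitor.Start()

    <-ctx.Done()   // block until shutdown is requested upstream
    monitor.Stop() // returns only after run() has cleaned up and exited
}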

3.3 Timeout-Forced Shutdown

Graceful shutdown can’t wait forever:

func gracefulShutdown(cancel context.CancelFunc, components []Stoppable) {
    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
    
    <-sigChan
    log.Println("Starting graceful shutdown...")
    
    // Notify all components to stop
    cancel()
    
    // Set timeout
    done := make(chan struct{})
    go func() {
        for _, c := range components {
            c.Stop() // Assuming each component has Stop() method
        }
        close(done)
    }()
    
    select {
    case <-done:
        log.Println("All components stopped normally")
    case <-time.After(30 * time.Second):
        log.Println("Timeout! Some components failed to stop gracefully")
    }
    
    log.Println("Shutdown complete")
}
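
The snippet assumes each component satisfies a small Stoppable interface along these lines:

// Stoppable is the contract gracefulShutdown relies on: Stop must block
// until the component's goroutines have exited and its resources are released.
type Stoppable interface {
    Stop()
}

DiskMonitor from section 3.2 already satisfies it.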

4. Config Hot-Reload

4.1 SIGHUP-Triggered Reload

type Config struct {
    CheckInterval time.Duration `toml:"check_interval"`
    AlertEmail    string        `toml:"alert_email"`
    // ...
}

var (
    config     *Config
    configLock sync.RWMutex
)

func loadConfig(path string) error {
    data, err := os.ReadFile(path)
    if err != nil {
        return err
    }
    
    var newConfig Config
    if _, err := toml.Decode(string(data), &newConfig); err != nil {
        return err
    }
    
    configLock.Lock()
    config = &newConfig
    configLock.Unlock()
    
    log.Printf("Config reloaded: %+v", newConfig)
    return nil
}

func getConfig() *Config {
    configLock.RLock()
    defer configLock.RUnlock()
    return config
}

// Config reload handler
func handleConfigReload(configPath string, reloadChan <-chan struct{}) {
    for range reloadChan {
        if err := loadConfig(configPath); err != nil {
            log.Printf("Config reload failed: %v", err)
        }
    }
}
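
Workers should read the snapshot through getConfig() on every iteration instead of caching it, so a reload takes effect on the next cycle. A sketch (checkLoop is hypothetical and assumes loadConfig already ran at startup):

// Hypothetical worker loop that picks up reloaded values on every iteration.
func checkLoop(ctx context.Context) {
    for {
        cfg := getConfig() // always read the latest snapshot

        select {
        case <-ctx.Done():
            return
        case <-time.After(cfg.CheckInterval):
            log.Printf("running check, alerts go to %s", cfg.AlertEmail)
        }
    }
}

Using time.After per iteration, rather than a fixed ticker, is what lets a changed check_interval apply without restarting the loop.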

4.2 Usage

# After modifying config file
kill -HUP $(pidof nas-agent)
# Service reloads config without restart

5. systemd Integration

5.1 Service File

# /etc/systemd/system/nas-agent.service
[Unit]
Description=NAS System Agent
After=network.target

[Service]
Type=simple
User=root
ExecStart=/usr/local/bin/nas-agent -config /etc/nas-agent/config.toml
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=5s

# Resource limits
MemoryMax=512M
MemoryHigh=400M
CPUQuota=50%

# Graceful shutdown timeout
TimeoutStopSec=30s

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=nas-agent

[Install]
WantedBy=multi-user.target

5.2 Key Config Explained

  • Type=simple: systemd treats the service as started as soon as the process launches, with no readiness notification (typical for Go programs)
  • ExecReload: systemctl reload nas-agent sends SIGHUP
  • Restart=on-failure: Auto-restart on abnormal exit
  • TimeoutStopSec: Time to wait for graceful shutdown, then SIGKILL
  • MemoryMax/CPUQuota: Resource limits (via cgroups v2)
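
Typical commands to install the unit and exercise these directives:

# Install/refresh the unit file and start the service
systemctl daemon-reload
systemctl enable --now nas-agent

# Hot-reload config (runs ExecReload, i.e. sends SIGHUP)
systemctl reload nas-agent

# Graceful stop: SIGTERM first, SIGKILL after TimeoutStopSec
systemctl stop nas-agent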

5.3 Log Integration

Using journald:

# View service logs
journalctl -u nas-agent -f

# View last 100 lines
journalctl -u nas-agent -n 100

# Query by time
journalctl -u nas-agent --since "2023-03-10 10:00" --until "2023-03-10 12:00"

In Go code, just use log or fmt output — journald captures it automatically.
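
One optional tweak (my suggestion, not something the unit file requires): journald already timestamps every entry, so the Go-side timestamp prefix can be dropped:

// journald records its own timestamp per entry, so plain messages are enough.
// stderr (the default log output) is captured via StandardError=journal.
log.SetFlags(0)
log.Println("disk check finished")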

6. Common Pitfalls

6.1 Goroutine Leaks

Wrong:

func (m *DiskMonitor) checkDisk(path string) {
    go func() {
        // If this blocks, goroutine never exits
        data, _ := exec.Command("smartctl", "-a", path).Output()
        m.process(data)
    }()
}

Correct:

func (m *DiskMonitor) checkDisk(path string) {
    go func() {
        ctx, cancel := context.WithTimeout(m.ctx, 30*time.Second)
        defer cancel()
        
        cmd := exec.CommandContext(ctx, "smartctl", "-a", path)
        data, err := cmd.Output()
        if err != nil {
            if ctx.Err() == context.DeadlineExceeded {
                log.Printf("smartctl timeout: %s", path)
            }
            return
        }
        m.process(data)
    }()
}

6.2 Ignoring Context

Wrong:

func doWork() {
    for {
        // This loop never stops!
        time.Sleep(time.Second)
        doSomething()
    }
}

Correct:

func doWork(ctx context.Context) {
    ticker := time.NewTicker(time.Second)
    defer ticker.Stop()
    
    for {
        select {
        case <-ctx.Done():
            return
        case <-ticker.C:
            doSomething()
        }
    }
}

6.3 Resource Cleanup Order

func shutdown(components []Component) {
    // Reverse order (last started, first stopped)
    for i := len(components) - 1; i >= 0; i-- {
        log.Printf("Stopping: %s", components[i].Name())
        components[i].Stop()
    }
}

7. Complete Example

package main

import (
    "context"
    "log"
    "os"
    "os/signal"
    "sync"
    "syscall"
    "time"
)

type Daemon struct {
    ctx        context.Context
    cancel     context.CancelFunc
    wg         sync.WaitGroup
    reloadChan chan struct{}
}

func NewDaemon() *Daemon {
    ctx, cancel := context.WithCancel(context.Background())
    return &Daemon{
        ctx:        ctx,
        cancel:     cancel,
        reloadChan: make(chan struct{}, 1),
    }
}

func (d *Daemon) Run() {
    // Start business goroutines
    d.wg.Add(2)
    go d.diskMonitorLoop()
    go d.taskSchedulerLoop()
    
    // Start signal handler
    d.handleSignals()
    
    // Wait for all goroutines to exit
    d.wg.Wait()
    log.Println("Daemon fully exited")
}

func (d *Daemon) diskMonitorLoop() {
    defer d.wg.Done()
    ticker := time.NewTicker(5 * time.Minute)
    defer ticker.Stop()
    
    for {
        select {
        case <-d.ctx.Done():
            log.Println("DiskMonitor: exiting")
            return
        case <-ticker.C:
            log.Println("DiskMonitor: checking disks...")
        }
    }
}

func (d *Daemon) taskSchedulerLoop() {
    defer d.wg.Done()
    ticker := time.NewTicker(time.Minute)
    defer ticker.Stop()
    
    for {
        select {
        case <-d.ctx.Done():
            log.Println("TaskScheduler: exiting")
            return
        case <-ticker.C:
            log.Println("TaskScheduler: checking scheduled tasks...")
        case <-d.reloadChan:
            log.Println("TaskScheduler: hot-reloading config")
        }
    }
}

func (d *Daemon) handleSignals() {
    sigChan := make(chan os.Signal, 1)
    signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
    
    for sig := range sigChan {
        switch sig {
        case syscall.SIGINT, syscall.SIGTERM:
            log.Printf("Received %v, starting graceful shutdown...", sig)
            d.cancel()
            return
        case syscall.SIGHUP:
            log.Println("Received SIGHUP, triggering config reload")
            select {
            case d.reloadChan <- struct{}{}:
            default:
                log.Println("Reload request already queued")
            }
        }
    }
}

func main() {
    log.SetFlags(log.LstdFlags | log.Lshortfile)
    log.Println("Daemon starting")
    
    daemon := NewDaemon()
    daemon.Run()
}

8. Summary

| Point              | Implementation                |
|--------------------|-------------------------------|
| Signal capture     | signal.Notify                 |
| Cancel propagation | context.WithCancel            |
| Wait for exit      | sync.WaitGroup                |
| Hot-reload         | SIGHUP + channel notification |
| Timeout protection | context.WithTimeout           |
| Resource limits    | systemd cgroups               |

Core principle: every goroutine must respond to context cancellation, and every resource must have a cleanup mechanism.