For a NAS product, I needed to develop a 24/7 system daemon for disk health monitoring and scheduled task management. This post covers the key technical aspects: signal handling, graceful shutdown, and config hot-reload.
1. Why Write Your Own Daemon?
1.1 Use Case
The NAS device needs a “caretaker” process:
- Periodic disk SMART status checks
- RAID array health monitoring
- Scheduled backup tasks
- OOM protection (proactively clear cache when memory is tight)
1.2 Why Not Just Use cron?
cron falls short here because the work:
- Needs a process that stays resident in memory and maintains state
- Involves tasks with dependencies on one another
- Must react immediately to system events (like disk plug/unplug)
- Needs custom retry logic (see the sketch after this list)
So we need a long-running daemon.
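The retry point in particular is awkward to express in a crontab. Here is a minimal sketch of the kind of policy a resident daemon can own (retryWithBackoff and its parameters are illustrative, not from the real agent; it assumes the context and time packages):

```go
// retryWithBackoff runs fn up to attempts times, doubling the delay after
// each failure, and gives up early if the context is cancelled.
func retryWithBackoff(ctx context.Context, attempts int, delay time.Duration, fn func() error) error {
	var err error
	for i := 0; i < attempts; i++ {
		if err = fn(); err == nil {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(delay):
			delay *= 2 // exponential backoff
		}
	}
	return err
}
```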
2. Basic Skeleton
2.1 Minimal Daemon Structure
```go
package main

import (
	"context"
	"log"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	// Create cancellable context
	ctx, cancel := context.WithCancel(context.Background())

	// Start business logic
	go runDaemon(ctx)

	// Wait for shutdown signal
	waitForShutdown(cancel)
}

func runDaemon(ctx context.Context) {
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			log.Println("Received shutdown signal, stopping...")
			return
		case <-ticker.C:
			doPeriodicWork() // placeholder for the actual checks
		}
	}
}

func waitForShutdown(cancel context.CancelFunc) {
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	sig := <-sigChan
	log.Printf("Received signal %v, starting graceful shutdown...", sig)
	cancel()

	// Give business logic time to clean up
	time.Sleep(5 * time.Second)
	log.Println("Shutdown complete")
}
```
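The fixed five-second sleep here is a placeholder: it gives goroutines a grace window but can't tell whether they actually finished. Section 3 replaces it with WaitGroup-based waiting plus a hard timeout.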
2.2 Signal Handling Deep Dive
Common signals and how to handle them:
| Signal | Trigger | Recommended Action |
|---|---|---|
| SIGINT (2) | Ctrl+C | Graceful shutdown |
| SIGTERM (15) | kill / systemd stop | Graceful shutdown |
| SIGKILL (9) | kill -9 | Can’t be caught! |
| SIGHUP (1) | Terminal disconnect / custom | Hot-reload config |
| SIGUSR1 (10) | User-defined | Print status/dump |
| SIGUSR2 (12) | User-defined | Toggle log level |
```go
func setupSignalHandlers(ctx context.Context, cancel context.CancelFunc, configReload chan struct{}) {
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan,
		syscall.SIGINT,
		syscall.SIGTERM,
		syscall.SIGHUP,
		syscall.SIGUSR1,
	)

	go func() {
		for sig := range sigChan {
			switch sig {
			case syscall.SIGINT, syscall.SIGTERM:
				log.Printf("Received %v, starting graceful shutdown", sig)
				cancel()
				return
			case syscall.SIGHUP:
				log.Println("Received SIGHUP, triggering hot-reload")
				configReload <- struct{}{}
			case syscall.SIGUSR1:
				log.Println("Received SIGUSR1, printing status")
				printDaemonStatus()
			}
		}
	}()
}
```
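The table above also lists SIGUSR2 for toggling the log level; that case isn't wired in here, but a minimal sketch could look like this (debugEnabled and toggleDebug are hypothetical names):

```go
// Requires "sync/atomic"; atomic.Bool needs Go 1.19+.
var debugEnabled atomic.Bool // hypothetical process-wide debug flag

// toggleDebug flips the flag. To wire it up, add syscall.SIGUSR2 to the
// signal.Notify call above and invoke toggleDebug from a
// "case syscall.SIGUSR2:" branch in the switch.
func toggleDebug() {
	debugEnabled.Store(!debugEnabled.Load())
	log.Printf("Debug logging enabled: %v", debugEnabled.Load())
}
```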
3. Graceful Shutdown
3.1 Why Is Graceful Shutdown Necessary?
If a process is killed immediately:
- Files being written may be corrupted
- Database connections not closed properly
- Network requests left hanging, clients time out
- Temp files not cleaned up
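The first point is the easiest to demonstrate: a writer killed mid-buffer loses data, while one that flushes on cancellation does not. A minimal sketch (the report-file plumbing is illustrative; assumes bufio, context, fmt, and os imports):

```go
// flushOnShutdown appends report lines to f through a buffer and flushes the
// buffer when ctx is cancelled; a SIGKILL would skip the deferred flush.
func flushOnShutdown(ctx context.Context, f *os.File, lines <-chan string) {
	w := bufio.NewWriter(f)
	defer w.Flush()

	for {
		select {
		case <-ctx.Done():
			return
		case line := <-lines:
			fmt.Fprintln(w, line)
		}
	}
}
```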
3.2 Using Context for Cancellation Propagation
```go
type DiskMonitor struct {
	ctx    context.Context
	cancel context.CancelFunc
	wg     sync.WaitGroup
}

func NewDiskMonitor(parentCtx context.Context) *DiskMonitor {
	ctx, cancel := context.WithCancel(parentCtx)
	return &DiskMonitor{
		ctx:    ctx,
		cancel: cancel,
	}
}

func (m *DiskMonitor) Start() {
	m.wg.Add(1)
	go func() {
		defer m.wg.Done()
		m.run()
	}()
}

func (m *DiskMonitor) run() {
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-m.ctx.Done():
			log.Println("DiskMonitor: received cancel signal")
			m.cleanup()
			return
		case <-ticker.C:
			m.checkAllDisks()
		}
	}
}

func (m *DiskMonitor) cleanup() {
	log.Println("DiskMonitor: cleaning up resources...")
	// Close database connections
	// Flush cache to disk
	// etc...
}

func (m *DiskMonitor) Stop() {
	log.Println("DiskMonitor: stop requested")
	m.cancel()
	m.wg.Wait() // Wait for goroutine to actually exit
	log.Println("DiskMonitor: fully stopped")
}
```
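Typical usage from the daemon's top level might look like the following sketch (runWithMonitor is an illustrative wrapper, not part of the monitor's API):

```go
func runWithMonitor(parent context.Context) {
	monitor := NewDiskMonitor(parent)
	monitor.Start()

	<-parent.Done() // wait for the daemon-wide shutdown signal
	monitor.Stop()  // blocks until run() has cleaned up and returned
}
```

Note the two-step shape: cancel() asks the goroutine to stop, and wg.Wait() proves that it did.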
3.3 Timeout-Forced Shutdown
Graceful shutdown can’t wait forever:
```go
// Stoppable is anything with a blocking Stop method.
type Stoppable interface {
	Stop()
}

func gracefulShutdown(cancel context.CancelFunc, components []Stoppable) {
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	<-sigChan
	log.Println("Starting graceful shutdown...")

	// Notify all components to stop
	cancel()

	// Set timeout
	done := make(chan struct{})
	go func() {
		for _, c := range components {
			c.Stop() // each Stop blocks until its component has exited
		}
		close(done)
	}()

	select {
	case <-done:
		log.Println("All components stopped normally")
	case <-time.After(30 * time.Second):
		log.Println("Timeout! Some components failed to stop gracefully")
	}
	log.Println("Shutdown complete")
}
```
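Since Go 1.16, os/signal also provides signal.NotifyContext, which collapses the channel plumbing on the shutdown path. A sketch of the same flow:

```go
func main() {
	// ctx is cancelled on the first SIGINT/SIGTERM; stop() restores default
	// signal behaviour, so a second Ctrl+C kills the process immediately.
	ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
	defer stop()

	go runDaemon(ctx) // runDaemon from section 2.1

	<-ctx.Done()
	// ...then run the timeout-bounded component shutdown shown above...
}
```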
4. Config Hot-Reload
4.1 SIGHUP-Triggered Reload
```go
// toml.Decode below matches the API of github.com/BurntSushi/toml.
import "github.com/BurntSushi/toml"

type Config struct {
	CheckInterval time.Duration `toml:"check_interval"`
	AlertEmail    string        `toml:"alert_email"`
	// ...
}

var (
	config     *Config
	configLock sync.RWMutex
)

func loadConfig(path string) error {
	data, err := os.ReadFile(path)
	if err != nil {
		return err
	}

	var newConfig Config
	if _, err := toml.Decode(string(data), &newConfig); err != nil {
		return err
	}

	configLock.Lock()
	config = &newConfig
	configLock.Unlock()

	log.Printf("Config reloaded: %+v", newConfig)
	return nil
}

func getConfig() *Config {
	configLock.RLock()
	defer configLock.RUnlock()
	return config
}

// Config reload handler
func handleConfigReload(configPath string, reloadChan <-chan struct{}) {
	for range reloadChan {
		if err := loadConfig(configPath); err != nil {
			log.Printf("Config reload failed: %v", err)
		}
	}
}
```
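Because loadConfig swaps in a whole new *Config instead of mutating the old one, readers can take a snapshot once per iteration and use it without holding the lock. A sketch of a worker doing that (the log line is illustrative):

```go
func monitorLoop(ctx context.Context) {
	for {
		cfg := getConfig() // snapshot; callers must never mutate it
		select {
		case <-ctx.Done():
			return
		case <-time.After(cfg.CheckInterval):
			log.Printf("checking disks, alerts go to %s", cfg.AlertEmail)
		}
	}
}
```

Using time.After instead of a long-lived ticker means a changed check_interval takes effect on the very next iteration.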
4.2 Usage
```bash
# After modifying config file
kill -HUP $(pidof nas-agent)
# Service reloads config without restart
```
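Putting sections 2.2 and 4.1 together, the wiring in main might look like this sketch (the config path is illustrative):

```go
func main() {
	ctx, cancel := context.WithCancel(context.Background())
	reloadChan := make(chan struct{}, 1) // buffered so a SIGHUP never blocks the handler

	if err := loadConfig("/etc/nas-agent/config.toml"); err != nil {
		log.Fatalf("Initial config load failed: %v", err)
	}

	go handleConfigReload("/etc/nas-agent/config.toml", reloadChan)
	setupSignalHandlers(ctx, cancel, reloadChan)

	runDaemon(ctx) // blocks until a shutdown signal calls cancel()
}
```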
5. systemd Integration
5.1 Service File
```ini
# /etc/systemd/system/nas-agent.service
[Unit]
Description=NAS System Agent
After=network.target

[Service]
Type=simple
User=root
ExecStart=/usr/local/bin/nas-agent -config /etc/nas-agent/config.toml
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=5s

# Resource limits
MemoryMax=512M
MemoryHigh=400M
CPUQuota=50%

# Graceful shutdown timeout
TimeoutStopSec=30s

# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=nas-agent

[Install]
WantedBy=multi-user.target
```
5.2 Key Config Explained
- Type=simple: Process is ready as soon as it starts (typical for Go programs)
- ExecReload: systemctl reload nas-agent sends SIGHUP
- Restart=on-failure: Auto-restart on abnormal exit
- TimeoutStopSec: Time to wait for graceful shutdown before systemd sends SIGKILL
- MemoryMax / CPUQuota: Resource limits (via cgroups v2)
5.3 Log Integration
Using journald:
```bash
# View service logs
journalctl -u nas-agent -f

# View last 100 lines
journalctl -u nas-agent -n 100

# Query by time
journalctl -u nas-agent --since "2023-03-10 10:00" --until "2023-03-10 12:00"
```
In Go code, just use log or fmt output — journald captures it automatically.
6. Common Pitfalls
6.1 Goroutine Leaks
Wrong:
```go
func (m *DiskMonitor) checkDisk(path string) {
	go func() {
		// If this blocks, goroutine never exits
		data, _ := exec.Command("smartctl", "-a", path).Output()
		m.process(data)
	}()
}
```
Correct:
```go
func (m *DiskMonitor) checkDisk(path string) {
	go func() {
		ctx, cancel := context.WithTimeout(m.ctx, 30*time.Second)
		defer cancel()

		cmd := exec.CommandContext(ctx, "smartctl", "-a", path)
		data, err := cmd.Output()
		if err != nil {
			if ctx.Err() == context.DeadlineExceeded {
				log.Printf("smartctl timeout: %s", path)
			}
			return
		}
		m.process(data)
	}()
}
```
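This works because exec.CommandContext kills the child process when the context expires, so Output() is guaranteed to unblock and the goroutine can exit.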
6.2 Ignoring Context
Wrong:
```go
func doWork() {
	for {
		// This loop never stops!
		time.Sleep(time.Second)
		doSomething()
	}
}
```
Correct:
```go
func doWork(ctx context.Context) {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			doSomething()
		}
	}
}
```
6.3 Resource Cleanup Order
```go
func shutdown(components []Component) {
	// Reverse order (last started, first stopped)
	for i := len(components) - 1; i >= 0; i-- {
		log.Printf("Stopping: %s", components[i].Name())
		components[i].Stop()
	}
}
```
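The snippet assumes a small interface along these lines (Name and Stop are the shape the loop above relies on):

```go
// Component is anything the daemon starts in order and stops in reverse order.
type Component interface {
	Name() string
	Stop()
}
```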
7. Complete Example
```go
package main

import (
	"context"
	"log"
	"os"
	"os/signal"
	"sync"
	"syscall"
	"time"
)

type Daemon struct {
	ctx        context.Context
	cancel     context.CancelFunc
	wg         sync.WaitGroup
	reloadChan chan struct{}
}

func NewDaemon() *Daemon {
	ctx, cancel := context.WithCancel(context.Background())
	return &Daemon{
		ctx:        ctx,
		cancel:     cancel,
		reloadChan: make(chan struct{}, 1),
	}
}

func (d *Daemon) Run() {
	// Start business goroutines
	d.wg.Add(2)
	go d.diskMonitorLoop()
	go d.taskSchedulerLoop()

	// Start signal handler
	d.handleSignals()

	// Wait for all goroutines to exit
	d.wg.Wait()
	log.Println("Daemon fully exited")
}

func (d *Daemon) diskMonitorLoop() {
	defer d.wg.Done()
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-d.ctx.Done():
			log.Println("DiskMonitor: exiting")
			return
		case <-ticker.C:
			log.Println("DiskMonitor: checking disks...")
		}
	}
}

func (d *Daemon) taskSchedulerLoop() {
	defer d.wg.Done()
	ticker := time.NewTicker(time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-d.ctx.Done():
			log.Println("TaskScheduler: exiting")
			return
		case <-ticker.C:
			log.Println("TaskScheduler: checking scheduled tasks...")
		case <-d.reloadChan:
			log.Println("TaskScheduler: hot-reloading config")
		}
	}
}

func (d *Daemon) handleSignals() {
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)

	for sig := range sigChan {
		switch sig {
		case syscall.SIGINT, syscall.SIGTERM:
			log.Printf("Received %v, starting graceful shutdown...", sig)
			d.cancel()
			return
		case syscall.SIGHUP:
			log.Println("Received SIGHUP, triggering config reload")
			select {
			case d.reloadChan <- struct{}{}:
			default:
				log.Println("Reload request already queued")
			}
		}
	}
}

func main() {
	log.SetFlags(log.LstdFlags | log.Lshortfile)
	log.Println("Daemon starting")
	daemon := NewDaemon()
	daemon.Run()
}
```
8. Summary
| Point | Implementation |
|---|---|
| Signal capture | signal.Notify |
| Cancel propagation | context.WithCancel |
| Wait for exit | sync.WaitGroup |
| Hot-reload | SIGHUP + channel notification |
| Timeout protection | context.WithTimeout |
| Resource limits | systemd cgroups |
Core principle: Every goroutine must respond to context cancellation, every resource must have a cleanup mechanism.