Skip to content

Commit 0cea926

Browse files
authored
Merge pull request #135 from nats-io/pid-signal-fixes
Fixes to config reloader for when server pid changes
2 parents 24881bc + 1c04564 commit 0cea926

File tree

4 files changed

+64
-26
lines changed

4 files changed

+64
-26
lines changed

Diff for: cmd/jetstream-controller/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright 2020-2022 The NATS Authors
1+
// Copyright 2020-2023 The NATS Authors
22
// Licensed under the Apache License, Version 2.0 (the "License");
33
// you may not use this file except in compliance with the License.
44
// You may obtain a copy of the License at

Diff for: cmd/nats-server-config-reloader/main.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ func main() {
5858
fs.StringVar(&nconfig.PidFile, "pid", "/var/run/nats/gnatsd.pid", "NATS Server Pid File")
5959
fs.Var(&fileSet, "c", "NATS Server Config File (may be repeated to specify more than one)")
6060
fs.Var(&fileSet, "config", "NATS Server Config File (may be repeated to specify more than one)")
61-
fs.IntVar(&nconfig.MaxRetries, "max-retries", 5, "Max attempts to trigger reload")
62-
fs.IntVar(&nconfig.RetryWaitSecs, "retry-wait-secs", 2, "Time to back off when reloading fails before retrying")
61+
fs.IntVar(&nconfig.MaxRetries, "max-retries", 30, "Max attempts to trigger reload")
62+
fs.IntVar(&nconfig.RetryWaitSecs, "retry-wait-secs", 4, "Time to back off when reloading fails before retrying")
6363
fs.IntVar(&customSignal, "signal", 1, "Signal to send to the NATS Server process (default SIGHUP 1)")
6464

6565
fs.Parse(os.Args[1:])

Diff for: pkg/natsreloader/natsreloader.go

+48-23
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
// Copyright 2020-2023 The NATS Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
114
package natsreloader
215

316
import (
@@ -12,6 +25,7 @@ import (
1225
"path/filepath"
1326
"sort"
1427
"strconv"
28+
"syscall"
1529
"time"
1630

1731
"github.com/fsnotify/fsnotify"
@@ -61,14 +75,21 @@ func (r *Reloader) waitForProcess() error {
6175
goto WaitAndRetry
6276
}
6377

78+
// This always succeeds regardless of the process existing or not.
6479
proc, err = os.FindProcess(pid)
6580
if err != nil {
6681
goto WaitAndRetry
6782
}
83+
84+
// Check if the process is still alive.
85+
err = proc.Signal(syscall.Signal(0))
86+
if err != nil {
87+
goto WaitAndRetry
88+
}
6889
break
6990

7091
WaitAndRetry:
71-
log.Printf(errorFmt, err)
92+
log.Printf("Error while monitoring pid %v: %v", pid, err)
7293
attempts++
7394
if attempts > r.MaxRetries {
7495
return fmt.Errorf("too many errors attempting to find server process")
@@ -77,11 +98,9 @@ func (r *Reloader) waitForProcess() error {
7798
}
7899

79100
if attempts > 0 {
80-
log.Printf("found pid from pidfile %q after %v failed attempts (%v time after start)",
81-
r.PidFile, attempts, time.Since(startTime))
101+
log.Printf("Found pid from pidfile %q after %v failed attempts (took %.3fs)",
102+
r.PidFile, attempts, time.Since(startTime).Seconds())
82103
}
83-
84-
r.pid = pid
85104
r.proc = proc
86105
return nil
87106
}
@@ -167,8 +186,10 @@ func handleEvents(configWatcher *fsnotify.Watcher, event fsnotify.Event, lastCon
167186
}
168187
}
169188

170-
func handleDeletedFiles(deletedFiles []string, configWatcher *fsnotify.Watcher, lastConfigAppliedCache map[string][]byte) ([]string, []string) {
171-
log.Printf("Ticker is running with deletedFiles %v", deletedFiles)
189+
func handleDeletedFiles(deletedFiles []string, configWatcher *fsnotify.Watcher, lastConfigAppliedCache map[string][]byte) ([]string, []string) {
190+
if len(deletedFiles) > 0 {
191+
log.Printf("Tracking files %v", deletedFiles)
192+
}
172193
newDeletedFiles := make([]string, 0, len(deletedFiles))
173194
updated := make([]string, 0, len(deletedFiles))
174195
for _, f := range deletedFiles {
@@ -212,7 +233,7 @@ func (r *Reloader) init() (*fsnotify.Watcher, map[string][]byte, error) {
212233
}
213234

214235
// lastConfigAppliedCache is the last config update
215-
// applied by us
236+
// applied by us.
216237
lastConfigAppliedCache := make(map[string][]byte)
217238

218239
// Preload config hashes, so we know their digests
@@ -225,13 +246,8 @@ func (r *Reloader) init() (*fsnotify.Watcher, map[string][]byte, error) {
225246
}
226247
lastConfigAppliedCache[configFile] = digest
227248
}
228-
229-
// If the two pids don't match then os.FindProcess() has done something
230-
// rather hinkier than we expect, but log them both just in case on some
231-
// future platform there's a weird namespace issue, as a difference will
232-
// help with debugging.
233-
log.Printf("Live, ready to kick pid %v (live, from %v spec) based on any of %v files",
234-
r.proc.Pid, r.pid, len(lastConfigAppliedCache))
249+
log.Printf("Live, ready to kick pid %v on config changes (files=%d)",
250+
r.proc.Pid, len(lastConfigAppliedCache))
235251

236252
if len(lastConfigAppliedCache) == 0 {
237253
log.Printf("Error: no watched config files cached; input spec was: %#v",
@@ -243,17 +259,26 @@ func (r *Reloader) init() (*fsnotify.Watcher, map[string][]byte, error) {
243259
func (r *Reloader) reload(updatedFiles []string) error {
244260
attempts := 0
245261
for {
246-
log.Printf("Sending signal '%s' to server to reload configuration due to: %s", r.Signal.String(), updatedFiles)
247-
err := r.proc.Signal(r.Signal)
262+
err := r.waitForProcess()
263+
if err != nil {
264+
goto Retry
265+
}
266+
267+
log.Printf("Sending pid %v '%s' signal to reload changes from: %s", r.proc.Pid, r.Signal.String(), updatedFiles)
268+
err = r.proc.Signal(r.Signal)
248269
if err == nil {
249270
return nil
250271
}
251-
log.Printf("Error during reload: %s\n", err)
272+
273+
Retry:
274+
if err != nil {
275+
log.Printf("Error during reload: %s", err)
276+
}
252277
if attempts > r.MaxRetries {
253278
return fmt.Errorf("too many errors (%v) attempting to signal server to reload: %w", attempts, err)
254279
}
255280
delay := retryJitter(time.Duration(r.RetryWaitSecs) * time.Second)
256-
log.Printf("Wait and retrying after some time [%v] ...", delay)
281+
log.Printf("Wait and retrying in %.3fs ...", delay.Seconds())
257282
time.Sleep(delay)
258283
attempts++
259284
}
@@ -287,23 +312,23 @@ func (r *Reloader) Run(ctx context.Context) error {
287312
case <-t.C:
288313
updatedFiles, deletedFiles = handleDeletedFiles(deletedFiles, configWatcher, lastConfigAppliedCache)
289314
if len(deletedFiles) == 0 {
290-
// No more deleted files, stop the ticker
315+
log.Printf("All monitored files detected.")
291316
t.Stop()
292317
tickerRunning = false
293318
}
294319
if len(updatedFiles) > 0 {
295-
// Send signal to reload the config
320+
// Send signal to reload the config.
296321
log.Printf("Updated files: %v", updatedFiles)
297322
break
298323
}
299324
continue
300-
// Check if the process is still alive
301325
case event := <-configWatcher.Events:
302326
updated, deleted := handleEvents(configWatcher, event, lastConfigAppliedCache)
303327
updatedFiles = removeDuplicateStrings(updated)
304328
deletedFiles = removeDuplicateStrings(append(deletedFiles, deleted...))
305329
if !tickerRunning {
306-
// Start the ticker to re-add deleted files
330+
// Start the ticker to re-add deleted files.
331+
log.Printf("Starting ticker to re-add all tracked files.")
307332
t.Reset(time.Second)
308333
tickerRunning = true
309334
}

Diff for: pkg/natsreloader/natsreloadertest/natsreloader_test.go

+13
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
// Copyright 2020-2023 The NATS Authors
2+
// Licensed under the Apache License, Version 2.0 (the "License");
3+
// you may not use this file except in compliance with the License.
4+
// You may obtain a copy of the License at
5+
//
6+
// http://www.apache.org/licenses/LICENSE-2.0
7+
//
8+
// Unless required by applicable law or agreed to in writing, software
9+
// distributed under the License is distributed on an "AS IS" BASIS,
10+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
// See the License for the specific language governing permissions and
12+
// limitations under the License.
13+
114
package natsreloadertest
215

316
import (

0 commit comments

Comments
 (0)