Skip to content

Commit f1bdf8d

Browse files
committed
Restart kmsg on error
1 parent 3442408 commit f1bdf8d

File tree

2 files changed

+351
-4
lines changed

2 files changed

+351
-4
lines changed

pkg/systemlogmonitor/logwatchers/kmsg/log_watcher_linux.go

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,23 @@ import (
2222
"time"
2323

2424
"github.com/euank/go-kmsg-parser/kmsgparser"
25-
"k8s.io/klog/v2"
25+
klog "k8s.io/klog/v2"
2626

2727
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
2828
logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
2929
"k8s.io/node-problem-detector/pkg/util"
3030
"k8s.io/node-problem-detector/pkg/util/tomb"
3131
)
3232

33+
const (
	// RestartOnErrorKey names the plugin configuration entry that turns on
	// restarting the kmsg parser after its message channel closes because
	// of an error.
	RestartOnErrorKey = "restartOnError"

	// retryDelay is how long to pause before each attempt to restart the
	// kmsg parser.
	retryDelay = 5 * time.Second
)
41+
3342
type kernelLogWatcher struct {
3443
cfg types.WatcherConfig
3544
startTime time.Time
@@ -83,6 +92,12 @@ func (k *kernelLogWatcher) Stop() {
8392
k.tomb.Stop()
8493
}
8594

95+
// restartOnError checks if the restart on error configuration is enabled.
96+
func (k *kernelLogWatcher) restartOnError() bool {
97+
value, exists := k.cfg.PluginConfig[RestartOnErrorKey]
98+
return exists && value == "true"
99+
}
100+
86101
// watchLoop is the main watch loop of kernel log watcher.
87102
func (k *kernelLogWatcher) watchLoop() {
88103
kmsgs := k.kmsgParser.Parse()
@@ -102,7 +117,28 @@ func (k *kernelLogWatcher) watchLoop() {
102117
case msg, ok := <-kmsgs:
103118
if !ok {
104119
klog.Error("Kmsg channel closed")
105-
return
120+
121+
// Only attempt to restart if configured to do so
122+
if !k.restartOnError() {
123+
klog.Infof("Restart on error not enabled, stopping watcher")
124+
return
125+
}
126+
127+
klog.Infof("Attempting to restart kmsg parser")
128+
129+
// Close the old parser
130+
if err := k.kmsgParser.Close(); err != nil {
131+
klog.Errorf("Failed to close kmsg parser: %v", err)
132+
}
133+
134+
// Try to restart with backoff
135+
var restarted bool
136+
kmsgs, restarted = k.retryCreateParser()
137+
if !restarted {
138+
// Stopping was signaled
139+
return
140+
}
141+
continue
106142
}
107143
klog.V(5).Infof("got kernel message: %+v", msg)
108144
if msg.Message == "" {
@@ -122,3 +158,26 @@ func (k *kernelLogWatcher) watchLoop() {
122158
}
123159
}
124160
}
161+
162+
// retryCreateParser attempts to create a new kmsg parser.
163+
// It returns the new message channel and true on success, or nil and false if stopping was signaled.
164+
func (k *kernelLogWatcher) retryCreateParser() (<-chan kmsgparser.Message, bool) {
165+
for {
166+
select {
167+
case <-k.tomb.Stopping():
168+
klog.Infof("Stop watching kernel log during restart attempt")
169+
return nil, false
170+
case <-time.After(retryDelay):
171+
}
172+
173+
parser, err := kmsgparser.NewParser()
174+
if err != nil {
175+
klog.Errorf("Failed to create new kmsg parser, retrying in %v: %v", retryDelay, err)
176+
continue
177+
}
178+
179+
k.kmsgParser = parser
180+
klog.Infof("Successfully restarted kmsg parser")
181+
return parser.Parse(), true
182+
}
183+
}

0 commit comments

Comments
 (0)