@@ -22,14 +22,23 @@ import (
2222 "time"
2323
2424 "github.com/euank/go-kmsg-parser/kmsgparser"
25- "k8s.io/klog/v2"
25+ klog "k8s.io/klog/v2"
2626
2727 "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
2828 logtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
2929 "k8s.io/node-problem-detector/pkg/util"
3030 "k8s.io/node-problem-detector/pkg/util/tomb"
3131)
3232
const (
	// RestartOnErrorKey is the plugin configuration key that enables
	// restarting the kmsg parser when its channel closes due to an error.
	RestartOnErrorKey = "restartOnError"

	// retryDelay is how long to wait before attempting to restart the
	// kmsg parser.
	retryDelay = 5 * time.Second
)
41+
3342type kernelLogWatcher struct {
3443 cfg types.WatcherConfig
3544 startTime time.Time
@@ -83,6 +92,12 @@ func (k *kernelLogWatcher) Stop() {
8392 k .tomb .Stop ()
8493}
8594
95+ // restartOnError checks if the restart on error configuration is enabled.
96+ func (k * kernelLogWatcher ) restartOnError () bool {
97+ value , exists := k .cfg .PluginConfig [RestartOnErrorKey ]
98+ return exists && value == "true"
99+ }
100+
86101// watchLoop is the main watch loop of kernel log watcher.
87102func (k * kernelLogWatcher ) watchLoop () {
88103 kmsgs := k .kmsgParser .Parse ()
@@ -102,7 +117,28 @@ func (k *kernelLogWatcher) watchLoop() {
102117 case msg , ok := <- kmsgs :
103118 if ! ok {
104119 klog .Error ("Kmsg channel closed" )
105- return
120+
121+ // Only attempt to restart if configured to do so
122+ if ! k .restartOnError () {
123+ klog .Infof ("Restart on error not enabled, stopping watcher" )
124+ return
125+ }
126+
127+ klog .Infof ("Attempting to restart kmsg parser" )
128+
129+ // Close the old parser
130+ if err := k .kmsgParser .Close (); err != nil {
131+ klog .Errorf ("Failed to close kmsg parser: %v" , err )
132+ }
133+
134+ // Try to restart with backoff
135+ var restarted bool
136+ kmsgs , restarted = k .retryCreateParser ()
137+ if ! restarted {
138+ // Stopping was signaled
139+ return
140+ }
141+ continue
106142 }
107143 klog .V (5 ).Infof ("got kernel message: %+v" , msg )
108144 if msg .Message == "" {
@@ -122,3 +158,26 @@ func (k *kernelLogWatcher) watchLoop() {
122158 }
123159 }
124160}
161+
162+ // retryCreateParser attempts to create a new kmsg parser.
163+ // It returns the new message channel and true on success, or nil and false if stopping was signaled.
164+ func (k * kernelLogWatcher ) retryCreateParser () (<- chan kmsgparser.Message , bool ) {
165+ for {
166+ select {
167+ case <- k .tomb .Stopping ():
168+ klog .Infof ("Stop watching kernel log during restart attempt" )
169+ return nil , false
170+ case <- time .After (retryDelay ):
171+ }
172+
173+ parser , err := kmsgparser .NewParser ()
174+ if err != nil {
175+ klog .Errorf ("Failed to create new kmsg parser, retrying in %v: %v" , retryDelay , err )
176+ continue
177+ }
178+
179+ k .kmsgParser = parser
180+ klog .Infof ("Successfully restarted kmsg parser" )
181+ return parser .Parse (), true
182+ }
183+ }
0 commit comments