Skip to content

Commit 024b1ca

Browse files
committed
[K8sDnsNameResolver] Fix Netty panic logging on JVM shutdown
Removes this erroneous log message on JVM shutdown: io.grpc.StatusRuntimeException: INTERNAL: Panic! This is a bug! Why it was logged: - it seems to happen when LookupSession.lookupAsync tries to establish a new DNS server connection after the JVM shutdown has already started (it could happen because JVM shutdown hooks are executed in unpredictable order) - under the hood that function tries to register a JVM shutdown hook to clean up NIO resources registering a shutdown hook during the shutdown sequence results in java.lang.IllegalStateException: Shutdown in progress - this exception is not reported as a failure case in LookupSession.lookupAsync result completion stage but is thrown directly on the method call - the code inside the resolver doesn't handle such direct exception gracefully, which causes it to propagate to Netty internals, resulting in a scary "Panic! This is a bug!" log message written to JUL by Netty internals The fix: - catch and handle the "error on JVM shutdown" case - reduce the log level for such errors below INFO - additionally, the resolver shutdown logic was hardened for all the cases where currently running resolution is in a race with the shutdown
1 parent 421ab9e commit 024b1ca

File tree

2 files changed

+136
-13
lines changed

2 files changed

+136
-13
lines changed

k8s-dns-name-resolver/src/main/java/com/evolution/jgrpc/tools/k8sdns/K8sDnsNameResolver.java

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
@Nullable private ScheduledHandle scheduledRefreshTask = null;
3535
@Nullable private SuccessResult lastSuccessfulResult = null;
3636
private boolean refreshing = false;
37+
private boolean isShuttingDown = false;
3738

3839
private record SuccessResult(List<InetAddress> addresses, Instant receiveTime) {}
3940

@@ -56,6 +57,7 @@ public String getServiceAuthority() {
5657

5758
@Override
5859
public void shutdown() {
60+
this.isShuttingDown = true;
5961
if (this.scheduledRefreshTask != null) {
6062
this.scheduledRefreshTask.cancel();
6163
}
@@ -95,12 +97,15 @@ private void startScheduledRefreshTask(long initialDelayMs) {
9597
}
9698

9799
private void refreshInner() {
98-
if (!this.refreshing) {
100+
if (!this.refreshing && !this.isShuttingDown) {
99101
this.refreshing = true;
100102
resolveAllAsync(
101103
(addresses, err) -> {
102104
try {
103-
if (err != null) {
105+
if (this.isShuttingDown
106+
|| err instanceof NameLookupState.LookupFailedJvmShutdownException) {
107+
handleResolutionResultDuringShutdown();
108+
} else if (err != null) {
104109
handleResolutionFailure(err);
105110
} else if (addresses != null && !addresses.isEmpty()) {
106111
handleResolutionSuccess(addresses);
@@ -116,6 +121,16 @@ private void refreshInner() {
116121
}
117122
}
118123

124+
private void handleResolutionResultDuringShutdown() {
125+
logger.info(
126+
"Received resolution result during shutdown, "
127+
+ "ignoring and cancelling future refresh attempts");
128+
if (this.scheduledRefreshTask != null) {
129+
this.scheduledRefreshTask.cancel();
130+
this.scheduledRefreshTask = null;
131+
}
132+
}
133+
119134
private void handleResolutionFailure(Throwable err) {
120135
/*
121136
NameResolver contract specifies that the client handles retries and their frequency.
@@ -159,7 +174,15 @@ private void resolveAllAsync(
159174
this.syncCtx.execute(
160175
() -> {
161176
if (err != null) {
162-
logger.error("DNS lookup failed", err);
177+
if (err instanceof NameLookupState.LookupFailedJvmShutdownException) {
178+
// lower the level below INFO if it's caused by JVM shutdown
179+
// to keep the shutdown logs clean;
180+
// the error log statement is not actionable in this case
181+
logger.debug(
182+
"DNS lookup failed due to JVM shutdown, nothing to do here", err);
183+
} else {
184+
logger.error("DNS lookup failed", err);
185+
}
163186
cb.accept(null, err);
164187
} else {
165188
this.nameLookupState = nameLookupState;

k8s-dns-name-resolver/src/main/java/com/evolution/jgrpc/tools/k8sdns/NameLookupState.java

Lines changed: 110 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
package com.evolution.jgrpc.tools.k8sdns;
22

33
import com.google.common.net.InetAddresses;
4+
import java.io.Serial;
45
import java.net.InetAddress;
56
import java.util.Collections;
67
import java.util.List;
78
import java.util.Optional;
89
import java.util.concurrent.CompletableFuture;
910
import java.util.concurrent.CompletionStage;
11+
import java.util.concurrent.locks.ReentrantLock;
1012
import java.util.function.Function;
1113
import org.jspecify.annotations.Nullable;
1214
import org.xbill.DNS.Name;
@@ -155,16 +157,34 @@ private CompletionStage<NameLookupState> lookupAlternativeAbsoluteNames(
155157
}
156158

157159
private CompletionStage<List<InetAddress>> lookupByAbsoluteName(Name name) {
158-
return session
159-
.lookupAsync(name, Type.A)
160-
.thenApply(
161-
(res) ->
162-
Optional.ofNullable(res).map(LookupResult::getRecords).orElse(List.of()).stream()
163-
.map(Record::rdataToString)
164-
.distinct()
165-
.sorted() // make sure that result comparison does not depend on order
166-
.map(InetAddresses::forString)
167-
.toList());
160+
CompletionStage<LookupResult> lookupAsyncResult;
161+
try {
162+
lookupAsyncResult = session.lookupAsync(name, Type.A);
163+
} catch (Exception e) {
164+
if (messageContainsIgnoreCase(e, "Shutdown in progress")
165+
// double-checking that we are in the middle of JVM shutdown process;
166+
// check LookupFailedJvmShutdownException description for details
167+
&& JvmShutdownDetector.isJvmShutdownDetected()) {
168+
lookupAsyncResult = CompletableFuture.failedFuture(new LookupFailedJvmShutdownException());
169+
} else {
170+
lookupAsyncResult = CompletableFuture.failedFuture(e);
171+
}
172+
}
173+
return lookupAsyncResult.thenApply(
174+
(res) ->
175+
Optional.ofNullable(res).map(LookupResult::getRecords).orElse(List.of()).stream()
176+
.map(Record::rdataToString)
177+
.distinct()
178+
.sorted() // make sure that result comparison does not depend on order
179+
.map(InetAddresses::forString)
180+
.toList());
181+
}
182+
183+
private static boolean messageContainsIgnoreCase(Throwable t, String str) {
184+
if (t.getMessage() == null) {
185+
return false;
186+
}
187+
return t.getMessage().toLowerCase().contains(str.toLowerCase());
168188
}
169189

170190
// org.xbill.DNS.lookup.LookupSession.safeConcat
@@ -183,4 +203,84 @@ private static Name concatFailIfInvalid(Name name, Name suffix) {
183203
throw new RuntimeException(e);
184204
}
185205
}
206+
207+
/**
208+
* Indicates that a DNS lookup failed due to JVM being in the process of shutdown.
209+
*
210+
* <p>This is a separate exception so the client code can lower the log level for this kind of
211+
* errors to keep the app shutdown logs clean.
212+
*
213+
* <p>Why such error could happen in {@link K8sDnsNameResolver}:
214+
*
215+
* <ul>
216+
* <li>When dnsjava's <code>LookupSession.lookupAsync</code> tries to establish a new DNS server
217+
* connection after the JVM shutdown has already started. It could happen because JVM
218+
* shutdown hooks are executed in unpredictable order.
219+
* <li>Under the hood, that function tries to register a JVM shutdown hook to clean up NIO
220+
* resources.
221+
* <li>Registering a shutdown hook during the shutdown sequence results in <code>
222+
* java.lang.IllegalStateException: Shutdown in progress</code>
223+
* <li>This exception is not reported as a failure case in the <code>LookupSession.lookupAsync
224+
* </code> result completion stage but is thrown directly on the method call
225+
* <li>If not caught, this error propagates to Netty internals , resulting in a scary "Panic!
226+
* This is a bug!" log message written to JUL by Netty internals
227+
* </ul>
228+
*/
229+
/* package */ static final class LookupFailedJvmShutdownException extends RuntimeException {
230+
231+
@Serial private static final long serialVersionUID = 1L;
232+
}
233+
234+
/**
235+
* A hack to detect if we are in the middle of JVM shutdown.
236+
*
237+
* <p>Shouldn't be called on a hot code path.
238+
*
239+
* <p>Check out {@link LookupFailedJvmShutdownException} for explanation why it's needed.
240+
*/
241+
private static final class JvmShutdownDetector {
242+
243+
private static volatile boolean JVM_SHUTDOWN_DETECTED = false;
244+
private static final ReentrantLock RUNTIME_POKE_LOCK = new ReentrantLock();
245+
246+
public static boolean isJvmShutdownDetected() {
247+
// serializing access to the actual detection method (pokeRuntimeIsInShutdown)
248+
// and caching the positive result to ensure we do not overload the JVM internal state
249+
if (JVM_SHUTDOWN_DETECTED) {
250+
return true;
251+
} else {
252+
RUNTIME_POKE_LOCK.lock();
253+
try {
254+
if (JVM_SHUTDOWN_DETECTED) {
255+
return true;
256+
}
257+
JVM_SHUTDOWN_DETECTED = pokeRuntimeIsInShutdown();
258+
return JVM_SHUTDOWN_DETECTED;
259+
} finally {
260+
RUNTIME_POKE_LOCK.unlock();
261+
}
262+
}
263+
}
264+
265+
private static boolean pokeRuntimeIsInShutdown() {
266+
var hook = new Thread(() -> {});
267+
try {
268+
Runtime.getRuntime().addShutdownHook(hook);
269+
Runtime.getRuntime().removeShutdownHook(hook);
270+
} catch (IllegalStateException e) {
271+
// java.lang.Runtime.addShutdownHook and java.lang.Runtime.removeShutdownHook
272+
// methods Javadoc specifies that IllegalStateException is thrown,
273+
// if called after JVM shutdown started
274+
return true;
275+
} catch (Error e) {
276+
// handling Error separately from other Throwable - Error should be propagated
277+
throw e;
278+
} catch (Throwable e) {
279+
// handling other remaining Throwable, just in case
280+
// we don't know the shutdown state here so returning false
281+
return false;
282+
}
283+
return false;
284+
}
285+
}
186286
}

0 commit comments

Comments
 (0)