From d032a53daef77ad7b42c270b3d8617f3a59518c8 Mon Sep 17 00:00:00 2001 From: mprokopchuk Date: Fri, 10 Apr 2026 11:39:45 +0530 Subject: [PATCH 1/2] Indirect agent connection improvements, includes the following improvements. - Enhances the Host connecting logic to avoid connecting storm (where Agent opens multiple sockets against Management Server). - Implements HostConnectProcess task where Host upon connection checks whether lock is available, traces Host connecting progress, status and timeout. - Introduces AgentConnectStatusCommand, where Host checks whether lock for the Host is available (i.e. "previous" connect process is finished). - Implementes logic to check whether Management Server has lock against Host (exposed MySQL DB lock presence via API) - Removes synchronization on Host disconnect process, double-disconnect logic in clustered Management Server environment, added early removal from ping map (in case of combination ping timeout delay + synchronized disconnect process the Agent Manager submits more disconnect requests) - Introduces parameterized connection and status check timeouts - Implements backoff algorithm abstraction - can be used either constant backoff timeout or exponential with jitter to wait between connection Host attempts to Management Server - Implements ServerAttache to be used on the Agent side of communication (similar to Attache on Management Server side) - Enhances/Adds logs significantly to Host Agent and Agent Manager logic to trace Host connecting and disconnecting process, including ids, names, context UUIDs and timings (how much time took overall initialization/deinitialization) - Adds logs to communication between Management Servers (PDU requests) - Adds DB indexes to improve search performance, uses IDEMPOTENT_ADD_INDEX for safer DB schema updates --- .../src/main/java/com/cloud/agent/Agent.java | 810 ++++++++++------ .../main/java/com/cloud/agent/AgentShell.java | 33 +- .../com/cloud/agent/HostConnectProcess.java | 355 +++++++ .../java/com/cloud/agent/IAgentShell.java | 2 + .../java/com/cloud/agent/ServerAttache.java | 476 ++++++++++ .../java/com/cloud/agent/ServerListener.java | 91 ++ .../com/cloud/agent/SynchronousListener.java | 102 ++ .../agent/properties/AgentProperties.java | 23 +- .../test/java/com/cloud/agent/AgentTest.java | 82 +- .../cloud/agent/HostConnectProcessTest.java | 59 ++ .../agent/api/AgentConnectStatusAnswer.java | 65 ++ .../agent/api/AgentConnectStatusCommand.java | 58 ++ .../com/cloud/agent/api/StartupAnswer.java | 25 + .../com/cloud/agent/transport/Request.java | 2 +- .../com/cloud/resource/ServerResource.java | 3 +- .../threadcontext/ThreadContextUtil.java | 59 ++ .../com/cloud/agent/manager/AgentAttache.java | 8 +- .../cloud/agent/manager/AgentManagerImpl.java | 875 +++++++++++++++--- .../manager/ClusteredAgentManagerImpl.java | 81 +- .../agent/manager/AgentManagerImplTest.java | 168 +++- .../ClusteredAgentManagerImplTest.java | 114 ++- .../procedures/cloud.idempotent_add_index.sql | 28 + .../META-INF/db/schema-42210to42300.sql | 14 + .../ClusterServiceServletHttpHandler.java | 14 +- .../cluster/ClusterServiceServletImpl.java | 23 +- .../cluster/dao/ManagementServerHostDao.java | 5 + .../dao/ManagementServerHostDaoImpl.java | 5 + .../framework/config/ConfigKeyUtil.java | 61 ++ .../main/java/com/cloud/utils/db/DbUtil.java | 69 +- .../java/com/cloud/utils/db/GlobalLock.java | 263 ++++-- .../agent/lb/IndirectAgentLBServiceImpl.java | 4 + .../main/java/com/cloud/utils/DateUtil.java | 33 + .../main/java/com/cloud/utils/LogUtils.java | 27 + .../cloud/utils/backoff/BackoffAlgorithm.java | 3 + .../cloud/utils/backoff/BackoffFactory.java | 96 ++ .../backoff/impl/ConstantTimeBackoff.java | 24 +- .../impl/ExponentialWithJitterBackoff.java | 166 ++++ .../ExponentialWithJitterBackoffMBean.java | 32 + .../utils/exception/CSExceptionErrorCode.java | 1 + .../java/com/cloud/utils/net/NetUtils.java | 3 + .../com/cloud/utils/nio/HandlerFactory.java | 6 +- .../main/java/com/cloud/utils/nio/Link.java | 56 +- .../java/com/cloud/utils/nio/NioClient.java | 151 ++- .../com/cloud/utils/nio/NioConnection.java | 292 +++--- .../java/com/cloud/utils/nio/NioServer.java | 10 +- .../backoff/impl/ConstantTimeBackoffTest.java | 2 +- .../com/cloud/utils/testcase/NioTest.java | 5 +- 47 files changed, 4106 insertions(+), 778 deletions(-) create mode 100644 agent/src/main/java/com/cloud/agent/HostConnectProcess.java create mode 100644 agent/src/main/java/com/cloud/agent/ServerAttache.java create mode 100644 agent/src/main/java/com/cloud/agent/ServerListener.java create mode 100644 agent/src/main/java/com/cloud/agent/SynchronousListener.java create mode 100644 agent/src/test/java/com/cloud/agent/HostConnectProcessTest.java create mode 100644 core/src/main/java/com/cloud/agent/api/AgentConnectStatusAnswer.java create mode 100644 core/src/main/java/com/cloud/agent/api/AgentConnectStatusCommand.java create mode 100644 core/src/main/java/org/apache/cloudstack/threadcontext/ThreadContextUtil.java create mode 100644 engine/schema/src/main/resources/META-INF/db/procedures/cloud.idempotent_add_index.sql create mode 100644 framework/config/src/main/java/org/apache/cloudstack/framework/config/ConfigKeyUtil.java create mode 100644 utils/src/main/java/com/cloud/utils/backoff/BackoffFactory.java create mode 100644 utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoff.java create mode 100644 utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoffMBean.java diff --git a/agent/src/main/java/com/cloud/agent/Agent.java b/agent/src/main/java/com/cloud/agent/Agent.java index 275fd41edc34..63ba0e27242f 100644 --- a/agent/src/main/java/com/cloud/agent/Agent.java +++ b/agent/src/main/java/com/cloud/agent/Agent.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; +import java.net.ConnectException; import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.Socket; @@ -31,6 +32,9 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.UUID; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -47,6 +51,13 @@ import javax.naming.ConfigurationException; +import com.cloud.agent.api.AgentConnectStatusAnswer; +import com.cloud.agent.api.AgentConnectStatusCommand; +import com.cloud.agent.properties.AgentProperties; +import com.cloud.utils.LogUtils; +import com.cloud.utils.UuidUtils; +import com.cloud.utils.backoff.BackoffFactory; +import com.cloud.utils.net.NetUtils; import org.apache.cloudstack.agent.lb.SetupMSListAnswer; import org.apache.cloudstack.agent.lb.SetupMSListCommand; import org.apache.cloudstack.ca.PostCertificateRenewalCommand; @@ -58,6 +69,7 @@ import org.apache.cloudstack.utils.security.KeyStoreUtils; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.ObjectUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -83,7 +95,6 @@ import com.cloud.exception.AgentControlChannelException; import com.cloud.host.Host; import com.cloud.resource.AgentStatusUpdater; -import com.cloud.resource.ResourceStatusUpdater; import com.cloud.resource.ServerResource; import com.cloud.utils.NumbersUtil; import com.cloud.utils.PropertiesUtil; @@ -91,7 +102,6 @@ import com.cloud.utils.concurrency.NamedThreadFactory; import com.cloud.utils.exception.CloudRuntimeException; import com.cloud.utils.exception.NioConnectionException; -import com.cloud.utils.exception.TaskExecutionException; import com.cloud.utils.nio.HandlerFactory; import com.cloud.utils.nio.Link; import com.cloud.utils.nio.NioClient; @@ -99,6 +109,8 @@ import com.cloud.utils.nio.Task; import com.cloud.utils.script.Script; +import static com.cloud.agent.HostConnectProcess.DEFAULT_ASYNC_COMMAND_TIMEOUT_SEC; + /** * @config * {@table @@ -115,6 +127,18 @@ public class Agent implements HandlerFactory, IAgentControl, AgentStatusUpdater { protected Logger logger = LogManager.getLogger(getClass()); + private static final int HOST_SOCKET_CONNECT_TIMEOUT_MS = 5000; + + /** + * Constant to verify {@link ConnectException} cause. + */ + private static final String CONNECTION_REFUSED_MSG = "Connection refused"; + + /** + * Constant to verify "probably rejected by MS due to {@link NioConnection#rejectConnectionIfBusy}". + */ + private static final String BROKEN_PIPE_MSG = "Broken pipe"; + public enum ExitStatus { Normal(0), // Normal status = 0. Upgrade(65), // Exiting for upgrade. @@ -142,7 +166,7 @@ public int value() { String _uuid; String _name; - ScheduledExecutorService selfTaskExecutor; + ScheduledExecutorService watchTaskExecutor; ScheduledExecutorService certExecutor; ScheduledExecutorService hostLbCheckExecutor; @@ -152,9 +176,7 @@ public int value() { long pingInterval = 0; AtomicInteger commandsInProgress = new AtomicInteger(0); - private final AtomicReference startupTask = new AtomicReference<>(); - private static final long DEFAULT_STARTUP_WAIT = 180; - long startupWait = DEFAULT_STARTUP_WAIT; + private static final AtomicBoolean RECONNECT_LOCK = new AtomicBoolean(false); boolean reconnectAllowed = true; //For time sensitive task, e.g. PingTask @@ -166,18 +188,12 @@ public int value() { private String keystoreSetupSetupPath; private String keystoreCertImportScriptPath; - private String hostname; + String hostname; - protected String getLinkLog(final Link link) { - if (link == null) { - return ""; - } - StringBuilder str = new StringBuilder(); - if (logger.isTraceEnabled()) { - str.append(System.identityHashCode(link)).append("-"); - } - str.append(link.getSocketAddress()); - return str.toString(); + HostConnectProcess hostConnectProcess = new HostConnectProcess(this); + + public ExecutorService getRequestHandler() { + return requestHandler; } protected String getAgentName() { @@ -190,7 +206,7 @@ protected String getAgentName() { protected void setupShutdownHookAndInitExecutors() { logger.trace("Adding shutdown hook"); Runtime.getRuntime().addShutdownHook(shutdownThread); - selfTaskExecutor = Executors.newScheduledThreadPool(1, new NamedThreadFactory("Agent-SelfTask")); + watchTaskExecutor = Executors.newScheduledThreadPool(1, new NamedThreadFactory("Agent-WatchTask")); outRequestHandler = new ThreadPoolExecutor(shell.getPingRetries(), 2 * shell.getPingRetries(), 10, TimeUnit.MINUTES, new SynchronousQueue<>(), new NamedThreadFactory("AgentOutRequest-Handler")); requestHandler = new ThreadPoolExecutor(shell.getWorkers(), 5 * shell.getWorkers(), 1, TimeUnit.DAYS, @@ -240,14 +256,9 @@ public Agent(final IAgentShell shell, final int localAgentId, final ServerResour throw new ConfigurationException("Unable to configure " + serverResource.getName()); } ThreadContext.put("agentname", getAgentName()); - final String host = this.shell.getNextHost(); - connection = new NioClient(getAgentName(), host, this.shell.getPort(), this.shell.getWorkers(), - this.shell.getSslHandshakeTimeout(), this); setupShutdownHookAndInitExecutors(); - logger.info("{} with host = {}, local id = {}", this, host, localAgentId); } - @Override public String toString() { return String.format("Agent [id = %s, uuid = %s, name = %s, type = %s, zone = %s, pod = %s, workers = %d, port = %d]", @@ -307,6 +318,7 @@ private void scavengeOldAgentObjects() { } public void start() { + ThreadContext.put("logcontextid", UuidUtils.first(UUID.randomUUID().toString())); if (!serverResource.start()) { String msg = String.format("Unable to start the resource: %s", serverResource.getName()); logger.error(msg); @@ -323,25 +335,8 @@ public void start() { throw new CloudRuntimeException(String.format("Unable to find the '%s' script", KeyStoreUtils.KS_IMPORT_SCRIPT)); } - try { - connection.start(); - } catch (final NioConnectionException e) { - logger.warn("Attempt to connect to server generated NIO Connection Exception {}, trying again", e.getLocalizedMessage()); - } - while (!connection.isStartup()) { - final String host = shell.getNextHost(); - shell.getBackoffAlgorithm().waitBeforeRetry(); - connection = new NioClient(getAgentName(), host, shell.getPort(), shell.getWorkers(), - shell.getSslHandshakeTimeout(), this); - logger.info("Connecting to host: {}", host); - try { - connection.start(); - } catch (final NioConnectionException e) { - stopAndCleanupConnection(false); - logger.info("Attempted to connect to the server, but received an unexpected exception, trying again...", e); - } - } - shell.updateConnectedHost(((NioClient)connection).getHost()); + reconnect(null, null, false); + scavengeOldAgentObjects(); } @@ -356,9 +351,9 @@ public void stop(final String reason, final String detail) { link.send(req.toBytes()); } } catch (final ClosedChannelException e) { - logger.warn("Unable to send: {}", cmd.toString()); + logger.warn("Unable to send: {}", cmd); } catch (final Exception e) { - logger.warn("Unable to send: {} due to exception: {}", cmd.toString(), e); + logger.warn("Unable to send: {} due to exception: {}", cmd, e); } logger.debug("Sending shutdown to management server"); try { @@ -376,10 +371,6 @@ public void stop(final String reason, final String detail) { serverResource = null; } - if (startupTask.get() != null) { - startupTask.set(null); - } - if (outRequestHandler != null) { outRequestHandler.shutdownNow(); outRequestHandler = null; @@ -390,11 +381,6 @@ public void stop(final String reason, final String detail) { requestHandler = null; } - if (selfTaskExecutor != null) { - selfTaskExecutor.shutdown(); - selfTaskExecutor = null; - } - if (hostLbCheckExecutor != null) { hostLbCheckExecutor.shutdown(); hostLbCheckExecutor = null; @@ -468,24 +454,21 @@ private void scheduleHostLBCheckerTask(final String lbAlgorithm, final long chec } } if (checkInterval > 0L) { + hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name))); if ("shuffle".equalsIgnoreCase(lbAlgorithm)) { logger.info("Scheduling the preferred host checker task to trigger once (to apply lb algorithm '{}') after host.lb.interval={} ms", lbAlgorithm, checkInterval); - hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name))); hostLbCheckExecutor.schedule(new PreferredHostCheckerTask(), checkInterval, TimeUnit.MILLISECONDS); - return; + } else { + logger.info("Scheduling a recurring preferred host checker task with host.lb.interval={} ms", checkInterval); + hostLbCheckExecutor.scheduleAtFixedRate(new PreferredHostCheckerTask(), checkInterval, checkInterval, TimeUnit.MILLISECONDS); } - - logger.info("Scheduling a recurring preferred host checker task with host.lb.interval={} ms", checkInterval); - hostLbCheckExecutor = Executors.newSingleThreadScheduledExecutor((new NamedThreadFactory(name))); - hostLbCheckExecutor.scheduleAtFixedRate(new PreferredHostCheckerTask(), checkInterval, checkInterval, - TimeUnit.MILLISECONDS); } } public void scheduleWatch(final Link link, final Request request, final long delay, final long period) { logger.debug("Adding a watch list"); final WatchTask task = new WatchTask(link, request, this); - final ScheduledFuture future = selfTaskExecutor.scheduleAtFixedRate(task, delay, period, TimeUnit.MILLISECONDS); + final ScheduledFuture future = watchTaskExecutor.scheduleAtFixedRate(task, delay, period, TimeUnit.MILLISECONDS); watchList.add(future); } @@ -498,7 +481,7 @@ public void triggerUpdate() { try { link.send(request.toBytes()); } catch (final ClosedChannelException e) { - logger.warn("Unable to send ping update: {}", request.toString()); + logger.warn("Unable to send ping update: {}", request); } } @@ -523,64 +506,6 @@ protected void cleanupAgentZoneProperties() { shell.setPersistentProperty(null, "pod", ""); } - public void lockStartupTask(final Link link) { - logger.debug("Creating startup task for link: {}", () -> getLinkLog(link)); - StartupTask currentTask = startupTask.get(); - if (currentTask != null) { - logger.warn("A Startup task is already locked or in progress, cannot create for link {}", - getLinkLog(link)); - return; - } - currentTask = new StartupTask(link); - if (startupTask.compareAndSet(null, currentTask)) { - selfTaskExecutor.schedule(currentTask, startupWait, TimeUnit.SECONDS); - return; - } - logger.warn("Failed to lock a StartupTask for link: {}", getLinkLog(link)); - } - - protected boolean cancelStartupTask() { - StartupTask task = startupTask.getAndSet(null); - if (task != null) { - task.cancel(); - return true; - } - return false; - } - - public void sendStartup(final Link link) { - sendStartup(link, false); - } - - public void sendStartup(final Link link, boolean transfer) { - final StartupCommand[] startup = serverResource.initialize(); - if (startup != null) { - final String msHostList = shell.getPersistentProperty(null, "host"); - final Command[] commands = new Command[startup.length]; - for (int i = 0; i < startup.length; i++) { - setupStartupCommand(startup[i]); - startup[i].setMSHostList(msHostList); - startup[i].setConnectionTransferred(transfer); - commands[i] = startup[i]; - } - final Request request = new Request(id != null ? id : -1, -1, commands, false, false); - request.setSequence(getNextSequence()); - - logger.debug("Sending Startup: {}", request.toString()); - lockStartupTask(link); - try { - link.send(request.toBytes()); - } catch (final ClosedChannelException e) { - logger.warn("Unable to send request to {} due to '{}', request: {}", - getLinkLog(link), e.getMessage(), request); - } - - if (serverResource instanceof ResourceStatusUpdater) { - ((ResourceStatusUpdater) serverResource).registerStatusUpdater(this); - } - } - } - protected String retrieveHostname() { logger.trace("Retrieving hostname with resource={}", () -> serverResource.getClass().getSimpleName()); final String result = Script.runSimpleBashScript(Script.getExecutableAbsolutePath("hostname"), 500); @@ -596,6 +521,17 @@ protected String retrieveHostname() { } } + protected AgentConnectStatusCommand setupAgentConnectStatusCommand(AgentConnectStatusCommand command) { + // there is requirement in Response class to have id as long, to avoid null pointer during boxing, let's set -1 here + command.setHostId(Optional.ofNullable(getId()).orElse(-1L)); + if (StringUtils.isBlank(hostname)) { + hostname = retrieveHostname(); + } + command.setHostName(hostname); + command.setHostGuid(getResourceGuid()); + return command; + } + protected void setupStartupCommand(final StartupCommand startup) { startup.setId(getId()); if (StringUtils.isBlank(startup.getName())) { @@ -623,98 +559,351 @@ public Task create(final Task.Type type, final Link link, final byte[] data) { return new ServerHandler(type, link, data); } - protected void reconnect(final Link link) { - reconnect(link, null, false); + protected void closeAndTerminateLink(Link link) { + Optional.ofNullable(link) + .map(Link::attachment) + .filter(ServerAttache.class::isInstance) + .map(ServerAttache.class::cast) + .ifPresentOrElse(ServerAttache::disconnect, () -> { + if (link != null) { + link.close(); + link.terminated(); + } + }); } - protected void reconnect(final Link link, String preferredMSHost, boolean forTransfer) { - if (!(forTransfer || reconnectAllowed)) { - logger.debug("Reconnect requested but it is not allowed {}", () -> getLinkLog(link)); + protected void stopAndCleanupConnection() { + if (connection == null) { return; } - cancelStartupTask(); - closeAndTerminateLink(link); - closeAndTerminateLink(this.link); - setLink(null); - cancelTasks(); - serverResource.disconnected(); - logger.info("Lost connection to host: {}. Attempting reconnection while we still have {} commands in progress.", shell.getConnectedHost(), commandsInProgress.get()); - stopAndCleanupConnection(true); - String host = preferredMSHost; - if (org.apache.commons.lang3.StringUtils.isBlank(host)) { - host = shell.getNextHost(); - } - List avoidMSHostList = shell.getAvoidHosts(); - do { - if (CollectionUtils.isEmpty(avoidMSHostList) || !avoidMSHostList.contains(host)) { - connection = new NioClient(getAgentName(), host, shell.getPort(), shell.getWorkers(), shell.getSslHandshakeTimeout(), this); - logger.info("Reconnecting to host: {}", host); - try { - connection.start(); - } catch (final NioConnectionException e) { - logger.info("Attempted to re-connect to the server, but received an unexpected exception, trying again...", e); - stopAndCleanupConnection(false); - } + NioConnection connection = this.connection; + connection.stop(); + try { + connection.cleanUp(); + } catch (final IOException e) { + logger.warn("Fail to clean up old connection", e); + } + + try { + while (connection.isStartup()) { + logger.debug("Waiting for connection graceful stop"); + shell.getBackoffAlgorithm().waitBeforeRetry(); + connection.stop(); } - shell.getBackoffAlgorithm().waitBeforeRetry(); - host = shell.getNextHost(); - } while (!connection.isStartup()); - shell.updateConnectedHost(((NioClient)connection).getHost()); - logger.info("Connected to the host: {}", shell.getConnectedHost()); + } catch (Exception e) { + logger.warn("Failed to gracefully stop connection", e); + } + logger.debug("Connection stopped"); + } + + /** + * Select the host to reconnect to based on priority: + * 1. preferredHost if defined and not blank + * 2. Link's socket address IP if available and not null + * 3. shell.getNextHost() if the above two options are not met + * + * @param preferredHost the preferred host to connect to + * @param link the current link which may contain socket address information + * @return the host to connect to + */ + protected String selectReconnectionHost(String preferredHost, Link link) { + return Optional.ofNullable(preferredHost) + .filter(org.apache.commons.lang3.StringUtils::isNotBlank) + .orElseGet(() -> Optional.ofNullable(link) + .map(Link::getSocketAddress) + .map(InetSocketAddress::getAddress) + .map(InetAddress::getHostAddress) + .orElseGet(shell::getNextHost)); } - protected void closeAndTerminateLink(final Link link) { - if (link == null) { + /** + * Reconnect to Management Server. + * + * @param link - connection holder + * @param preferredHost - if defined, reconnect will be performed to this Host first, + * otherwise will be used {@link IAgentShell#getNextHost()} + * @param forceReconnect - expected to be true if called by {@link MigrateAgentConnectionCommand}, + * this is only "switch Management Server", it does not perform full Host Connect process. + */ + protected void reconnect(Link link, String preferredHost, boolean forceReconnect) { + if (!RECONNECT_LOCK.compareAndSet(false, true)) { + logger.warn("Reconnect is already running, exiting"); return; } - link.close(); - link.terminated(); + String requestedLink = Optional.ofNullable(link).map(Link::toString).orElse("N/A"); + String currentLink = Optional.ofNullable(this.link).map(Link::toString).orElse("N/A"); + logger.info("Reconnect info: provided link: {}, agent link: {}, preferred host: {}, force" + + " reconnect: {}", requestedLink, currentLink, preferredHost, forceReconnect); + + try { + logger.debug("Obtained reconnect lock"); + if (!(forceReconnect || reconnectAllowed)) { + if (logger.isDebugEnabled()) { + logger.debug("Reconnect requested but it is not allowed {}", link); + } + return; + } + + if (isReconnectStormDetected(link, preferredHost, requestedLink, currentLink)) { + return; + } + + cleanupConnectionBeforeReconnect(link); + // start with preferred host + String host = selectReconnectionHost(preferredHost, link); + + String hostLog = LogUtils.getHostLog(host, shell.getPort()); + List avoidMsHostList = Optional.ofNullable(shell.getAvoidHosts()).orElseGet(List::of); + // pointer to the first element of "refuse loop" + AtomicReference firstRefuseLoopHostRef = new AtomicReference<>(null); + // to break deadlock where "non-avoid" MS Hosts are down and only "avoid" are up + AtomicBoolean ignoreAvoidMsHostListRef = new AtomicBoolean(false); + do { + AtomicBoolean skipTimeoutRef = new AtomicBoolean(false); + String parentLogContextId = (String) ThreadContext.get("logcontextid"); + if (parentLogContextId != null) { + ThreadContext.put("logcontextid-parent", parentLogContextId); + } + ThreadContext.put("logcontextid", UuidUtils.first(UUID.randomUUID().toString())); + if (ignoreAvoidMsHostListRef.get() || !avoidMsHostList.contains(host)) { + connection = new NioClient(getAgentName(), host, shell.getPort(), shell.getWorkers(), + shell.getSslHandshakeTimeout(), this); + logger.info("Reconnecting to host: {}", hostLog); + try { + connection.start(); + // successfully connected, skip the rest + continue; + } catch (Exception e) { + logReconnectionFailure(e, hostLog); + + try { + stopAndCleanupConnection(); + } catch (Exception ex) { + logger.warn("Got an exception during stop and cleanup connection", e); + } + + updateRefuseLoopState(e, host, firstRefuseLoopHostRef, ignoreAvoidMsHostListRef, skipTimeoutRef); + } + } else { + logger.debug("Next host {} is in avoid list, skipped", hostLog); + if (org.apache.commons.lang3.StringUtils.isBlank(preferredHost)) { + logHostLists(avoidMsHostList); + skipTimeoutRef.set(true); + } + } + if(!skipTimeoutRef.get()) { + shell.getBackoffAlgorithm().waitBeforeRetry(); + } + host = shell.getNextHost(); + hostLog = LogUtils.getHostLog(host, shell.getPort()); + logger.debug("Next host to connect: {}", hostLog); + } while (!connection.isStartup()); + // successfully connected + shell.updateConnectedHost(((NioClient) connection).getHost()); + String msg = String.format("Connected to the host: %s (%s)", shell.getConnectedHost(), this.link); + logger.info(msg); + } finally { + RECONNECT_LOCK.set(false); + logger.debug("Removed reconnect lock"); + } } - protected void stopAndCleanupConnection(boolean waitForStop) { - if (connection == null) { - return; + /** + * Handles "Connection refused" loop detection and determines if backoff timeout should be skipped. + * Manages refuse loop state to detect when all management servers have been tried and need + * to ignore avoid list to prevent deadlock. + * + * @param e the exception from connection attempt + * @param host the current host being attempted + * @param firstRefuseLoopHostRef reference to first host in refuse loop (modified by this method) + * @param ignoreAvoidMsHostListRef flag to ignore avoid list (modified by this method) + * @return true if timeout should be skipped (connection refused), false otherwise + */ + private void updateRefuseLoopState(Exception e, String host, AtomicReference firstRefuseLoopHostRef, AtomicBoolean ignoreAvoidMsHostListRef, AtomicBoolean skipTimeoutRef) { + // we are skipping timeout for "Connection refused" to not waste time on down MS + boolean skipTimeout = Optional.ofNullable(e.getCause()) + .filter(ConnectException.class::isInstance) + .map(Throwable::getMessage) + .filter(CONNECTION_REFUSED_MSG::equalsIgnoreCase) + .isPresent(); + skipTimeoutRef.set(skipTimeout); + String firstRefuseLoopHost = firstRefuseLoopHostRef.get(); + // for each "Connection refused" (maybe need to have a copy of variable with better name) + // start "refuse loop" + if (skipTimeout && firstRefuseLoopHost == null) { + firstRefuseLoopHostRef.set(host); + ignoreAvoidMsHostListRef.set(false); + logger.debug("Started refuse loop for host {}", firstRefuseLoopHost); + // closed "refuse loop" + } else if (skipTimeout && firstRefuseLoopHost.equalsIgnoreCase(host)) { + ignoreAvoidMsHostListRef.set(true); + logger.debug("Closed refuse loop for host {}", firstRefuseLoopHost); + // got non "refuse" related issue, break "refuse loop" + } else if (!skipTimeout && (firstRefuseLoopHostRef != null || ignoreAvoidMsHostListRef.get())) { + logger.debug("Broke refuse loop for host {} by {}", firstRefuseLoopHost, host); + firstRefuseLoopHostRef.set(null); + ignoreAvoidMsHostListRef.set(false); } - connection.stop(); + } + + /** + * Logs reconnection failure with appropriate level based on rejection reason. + * If connection was rejected due to max concurrent connections limit (Broken pipe), + * logs as warning. Otherwise logs as info. + * + * @param e the exception that occurred during reconnection attempt + * @param hostLog the formatted host log string for logging + */ + private void logReconnectionFailure(Exception e, String hostLog) { + // check if got NIO Connection exception, caused by IO Exception "Broken pipe" + boolean rejectedByMs = Optional.of(e).filter(NioConnectionException.class::isInstance) + .map(Exception::getCause) + .filter(IOException.class::isInstance) + .map(IOException.class::cast) + .map(IOException::getMessage) + .filter(BROKEN_PIPE_MSG::equalsIgnoreCase) + .isPresent(); + if(rejectedByMs) { + logger.warn("Attempted to re-connect to {}, but rejected" + + " due to 'agent.max.concurrent.new.connections' reached limit," + + " will try again", hostLog, e); + } else { + logger.info("Attempted to re-connect to {}, but got exception," + + " will try again", hostLog, e); + } + } + + /** + * Logs all management server host lists for debugging reconnection logic. + * Outputs defined hosts, hosts to avoid, and calculated available hosts. + * + * @param avoidMsHostList list of management server hosts to avoid during reconnection + */ + private void logHostLists(List avoidMsHostList) { + logger.debug("Preferred host is not defined"); try { - connection.cleanUp(); - } catch (final IOException e) { - logger.warn("Fail to clean up old connection. {}", e); + List hostsList = Optional.ofNullable(shell.getHosts()) + .map(Arrays::asList) + .orElseGet(List::of); + + List hostsShortList = new ArrayList<>(hostsList); + hostsShortList.removeAll(avoidMsHostList); + + logger.info("Defined hosts: {} Avoid hosts: {} Available hosts: {}", + String.join(", ", hostsList), String.join(", ", avoidMsHostList), String.join(", ", hostsShortList)); + } catch (Exception e) { + logger.warn("Failed to calculate next host logic", e); } - if (!waitForStop) { - return; + } + + /** + * Cleans up current connection state before attempting reconnection. + * Stops host connect process, terminates links, cancels scheduled tasks, + * notifies server resource about disconnection, and resets connection tracking. + * + * @param link the link that triggered reconnection + */ + private void cleanupConnectionBeforeReconnect(Link link) { + String lastConnectedHost = shell.getConnectedHost(); + try { + // reset Host status track and Startup process initiating + logger.debug("Stopping Host Connect process"); + hostConnectProcess.stop(); + closeAndTerminateLink(link); + closeAndTerminateLink(this.link); + setLink(null); + cancelTasks(); + serverResource.disconnected(); + stopAndCleanupConnection(); + shell.updateConnectedHost(null); + } catch (Exception ex) { + logger.error("Failed to cleanup previous connection", ex); + } + logger.info("Lost connection to host: {}. Attempting reconnection while we still have" + + " {} commands in progress.", lastConnectedHost, commandsInProgress.get()); + } + + /** + * Detects reconnection storm by checking if the reconnect request is redundant. + * This prevents processing stale reconnection requests for old links when + * agent has already established a new connection. + * + * @param link the link requesting reconnection + * @param preferredHost the preferred host to reconnect to (may be null) + * @param requestedLink string representation of the requested link for logging + * @param currentLink string representation of the current agent link for logging + * @return true if reconnection storm is detected and request should be skipped, false otherwise + */ + private boolean isReconnectStormDetected(Link link, String preferredHost, String requestedLink, String currentLink) { + logger.debug("Calling storm guard"); + boolean reconnectForCurrentLink = link == this.link; + boolean currentLinkTerminated = this.link != null && this.link.isTerminated(); + boolean reconnectForNewHost = this.hostname != null && this.hostname.equals(preferredHost); + // if none of the above is true + boolean stormDetected = ! (reconnectForCurrentLink || currentLinkTerminated || reconnectForNewHost); + // connection storm guard + if (stormDetected) { + logger.warn("Reconnect requested for the connection {} but current connection is " + + "{} and preferred host {}, skipping", requestedLink, !currentLinkTerminated ? currentLink : currentLink + " (terminated)", preferredHost); } - do { - shell.getBackoffAlgorithm().waitBeforeRetry(); - } while (connection.isStartup()); + return stormDetected; } - public void processStartupAnswer(final Answer answer, final Response response, final Link link) { - boolean answerValid = cancelStartupTask(); - final StartupAnswer startup = (StartupAnswer)answer; + public void processStartupAnswer(final StartupAnswer startup, final Response response, final Link link) { + setBackoffAlgorithm(startup); + if (!startup.getResult()) { - logger.error("Not allowed to connect to the server: {}", answer.getDetails()); + logger.error("Not allowed to connect to the server: {}", startup.getDetails()); if (serverResource != null && !serverResource.isExitOnFailures()) { logger.trace("{} does not allow exit on failure, reconnecting", serverResource.getClass().getSimpleName()); - reconnect(link); + logger.info("Reconnecting for {}", link); + requestHandler.submit(() -> reconnect(link, null, false)); return; } + logger.fatal("Got unsuccessful result {} from the answer {}, details: {}", + startup.getResult(), startup.getClass().getSimpleName(), startup.getDetails()); System.exit(1); } - if (!answerValid) { + + boolean processWasRunning = hostConnectProcess.stop(); + if (!processWasRunning) { logger.warn("Threw away a startup answer because we're reconnecting."); return; } + handleStartupAnswer(startup, response, link); + } + + private void handleStartupAnswer(StartupAnswer startup, Response response, Link link) { logger.info("Process agent startup answer, agent [id: {}, uuid: {}, name: {}] connected to the server", startup.getHostId(), startup.getHostUuid(), startup.getHostName()); setId(startup.getHostId()); - setUuid(startup.getHostUuid()); - setName(startup.getHostName()); + // older builds do not send host uuid and names, do not set it, otherwise null pointer exception will be thrown + String hostUuid = startup.getHostUuid(); + if (org.apache.commons.lang3.StringUtils.isNotEmpty(hostUuid)) { + setUuid(hostUuid); + } + String hostName = startup.getHostName(); + if (org.apache.commons.lang3.StringUtils.isNotEmpty(hostName)) { + setName(hostUuid); + } pingInterval = startup.getPingInterval() * 1000L; // change to ms. + // Process agent.host.status.check.delay.sec parameter from Management Server + Integer agentHostStatusCheckDelaySec = startup.getAgentHostStatusCheckDelaySec(); + if (agentHostStatusCheckDelaySec != null) { + logger.info("Received agent.host.status.check.delay.sec={} from Management Server", agentHostStatusCheckDelaySec); + try { + shell.setPersistentProperty(null, AgentProperties.AGENT_HOST_STATUS_CHECK_DELAY_SEC.getName(), agentHostStatusCheckDelaySec.toString()); + hostConnectProcess.updateHostStatusCheckDelay(agentHostStatusCheckDelaySec); + logger.info("Updated agent.host.status.check.delay.sec to {} seconds", agentHostStatusCheckDelaySec); + } catch (Exception e) { + logger.warn("Failed to persist agent.host.status.check.delay.sec parameter from StartupAnswer", e); + } + } + updateLastPingResponseTime(); scheduleWatch(link, response, pingInterval, pingInterval); @@ -724,6 +913,17 @@ public void processStartupAnswer(final Answer answer, final Response response, f startup.getHostId(), startup.getHostUuid(), startup.getHostName()); } + private void setBackoffAlgorithm(StartupAnswer startup) { + try { + logger.info("Updating backoff delay algorithm implementation, defined in {}", startup.getClass().getSimpleName()); + shell.setBackoffAlgorithm(BackoffFactory.create(startup.getParams())); + logger.info("Created {} delay algorithm implementation", shell.getBackoffAlgorithm().getClass() + .getSimpleName()); + } catch (Exception e) { + logger.warn("Failed to create backoff with provided settings {}", startup.getParams(), e); + } + } + protected void processRequest(final Request request, final Link link) { boolean requestLogged = false; Response response = null; @@ -739,7 +939,8 @@ protected void processRequest(final Request request, final Link link) { ThreadContext.put("logcontextid", cmd.getContextParam("logid")); } if (logger.isDebugEnabled()) { - if (!requestLogged) // ensures request is logged only once per method call + // ensures request is logged only once per method call + if (!requestLogged) { final String requestMsg = request.toString(); if (requestMsg != null) { @@ -747,15 +948,15 @@ protected void processRequest(final Request request, final Link link) { } requestLogged = true; } - logger.debug("Processing command: {}", cmd.toString()); + logger.debug("Processing command: {}", cmd); } if (cmd instanceof CronCommand) { - final CronCommand watch = (CronCommand)cmd; + final CronCommand watch = (CronCommand) cmd; scheduleWatch(link, request, watch.getInterval() * 1000L, watch.getInterval() * 1000L); answer = new Answer(cmd, true, null); } else if (cmd instanceof ShutdownCommand) { - final ShutdownCommand shutdown = (ShutdownCommand)cmd; + final ShutdownCommand shutdown = (ShutdownCommand) cmd; logger.debug("Received shutdownCommand, due to: {}", shutdown.getReason()); cancelTasks(); if (shutdown.isRemoveHost()) { @@ -763,31 +964,34 @@ protected void processRequest(final Request request, final Link link) { } reconnectAllowed = false; answer = new Answer(cmd, true, null); - } else if (cmd instanceof ReadyCommand && ((ReadyCommand)cmd).getDetails() != null) { - - logger.debug("Not ready to connect to mgt server: {}", ((ReadyCommand)cmd).getDetails()); + } else if (cmd instanceof ReadyCommand && ((ReadyCommand) cmd).getDetails() != null) { + ReadyCommand readyCmd = (ReadyCommand) cmd; + logger.debug("Not ready to connect to mgt server: {}", readyCmd.getDetails()); if (serverResource != null && !serverResource.isExitOnFailures()) { logger.trace("{} does not allow exit on failure, reconnecting", serverResource.getClass().getSimpleName()); - reconnect(link); + logger.info("Reconnecting for {}", link); + requestHandler.submit(() -> reconnect(link, null, false)); return; } - System.exit(1); + logger.fatal("Got unsuccessful from the answer {}, details: {}", + readyCmd.getClass().getSimpleName(), readyCmd.getDetails()); + System.exit(3); return; } else if (cmd instanceof MaintainCommand) { logger.debug("Received maintainCommand, do not cancel current tasks"); - answer = new MaintainAnswer((MaintainCommand)cmd); + answer = new MaintainAnswer((MaintainCommand) cmd); } else if (cmd instanceof AgentControlCommand) { answer = null; for (final IAgentControlListener listener : controlListeners) { - answer = listener.processControlRequest(request, (AgentControlCommand)cmd); + answer = listener.processControlRequest(request, (AgentControlCommand) cmd); if (answer != null) { break; } } if (answer == null) { - logger.warn("No handler found to process cmd: {}", cmd.toString()); + logger.warn("No handler found to process command: {}", cmd); answer = new AgentControlAnswer(cmd); } } else if (cmd instanceof SetupKeyStoreCommand && ((SetupKeyStoreCommand) cmd).isHandleByAgent()) { @@ -815,7 +1019,7 @@ protected void processRequest(final Request request, final Link link) { commandsInProgress.decrementAndGet(); } if (answer == null) { - logger.debug("Response: unsupported command {}", cmd.toString()); + logger.debug("Response: unsupported command {}", cmd); answer = Answer.createUnsupportedCommandAnswer(cmd); } } @@ -846,8 +1050,8 @@ protected void processRequest(final Request request, final Link link) { if (response != null) { try { link.send(response.toBytes()); - } catch (final ClosedChannelException e) { - logger.warn("Unable to send response: {}", response.toString()); + } catch (final Exception e) { + logger.warn("Unable to send response {}: {} ", response, e.getMessage()); } } } @@ -992,60 +1196,71 @@ private void migrateAgentConnection(List avoidMsList) { throw new CloudRuntimeException("No other Management Server hosts to migrate"); } - String preferredMSHost = null; + String preferredMSHost = null; for (String msHost : msHostsList) { try (final Socket socket = new Socket()) { - socket.connect(new InetSocketAddress(msHost, shell.getPort()), 5000); + logger.debug("Connecting to {}:{} with timeout {} ms", msHost, shell.getPort(), HOST_SOCKET_CONNECT_TIMEOUT_MS); + socket.connect(new InetSocketAddress(msHost, shell.getPort()), HOST_SOCKET_CONNECT_TIMEOUT_MS); + logger.debug("Connected to {}:{} with timeout {} ms", msHost, shell.getPort(), HOST_SOCKET_CONNECT_TIMEOUT_MS); preferredMSHost = msHost; break; } catch (final IOException e) { - throw new CloudRuntimeException("Management server host: " + msHost + " is not reachable, to migrate connection"); + throw new CloudRuntimeException(String.format("Management server host: %s:%s is not reachable, to migrate connection", msHost, shell.getPort())); } } if (preferredMSHost == null) { - throw new CloudRuntimeException("Management server host(s) are not reachable, to migrate connection"); + throw new CloudRuntimeException(String.format("Management server host(s) %s are not reachable, to migrate connection", msHostsList)); } - logger.debug("Management server host " + preferredMSHost + " is found to be reachable, trying to reconnect"); + logger.debug("Management server host {} is found to be reachable, trying to reconnect", preferredMSHost); shell.resetHostCounter(); shell.setAvoidHosts(avoidMsList); shell.setConnectionTransfer(true); - reconnect(link, preferredMSHost, true); + + String preferredMSHostFinal = preferredMSHost; + logger.info("Reconnecting to {}", link); + requestHandler.submit(() -> reconnect(link, preferredMSHostFinal, true)); } public void processResponse(final Response response, final Link link) { final Answer answer = response.getAnswer(); - logger.debug("Received response: {}", response.toString()); + logger.debug("Received response: {}", response); if (answer instanceof StartupAnswer) { - processStartupAnswer(answer, response, link); + processStartupAnswer((StartupAnswer) answer, response, link); } else if (answer instanceof AgentControlAnswer) { // Notice, we are doing callback while holding a lock! for (final IAgentControlListener listener : controlListeners) { - listener.processControlResponse(response, (AgentControlAnswer)answer); + listener.processControlResponse(response, (AgentControlAnswer) answer); } } else if (answer instanceof PingAnswer) { - processPingAnswer((PingAnswer) answer); + processPingAnswer(link, (PingAnswer) answer); } else { + ServerAttache attachment = (ServerAttache) link.attachment(); + if (attachment != null) { + attachment.processAnswers(response.getSequence(), response); + } else { + logger.trace("No attachments in the link, nothing to process"); + } updateLastPingResponseTime(); } } - private void processPingAnswer(final PingAnswer answer) { + private void processPingAnswer(final Link link, final PingAnswer answer) { if ((answer.isSendStartup()) && reconnectAllowed) { logger.info("Management server requested startup command to reinitialize the agent"); - sendStartup(link); + hostConnectProcess.scheduleConnectProcess(link, false); } else { - serverResource.processPingAnswer((PingAnswer) answer); + serverResource.processPingAnswer(answer); } shell.setAvoidHosts(answer.getAvoidMsList()); } public void processReadyCommand(final Command cmd) { - final ReadyCommand ready = (ReadyCommand)cmd; + final ReadyCommand ready = (ReadyCommand) cmd; // Set human readable sizes; Boolean humanReadable = ready.getEnableHumanReadableSizes(); - if (humanReadable != null){ + if (humanReadable != null) { NumbersUtil.enableHumanReadableSizes = humanReadable; } @@ -1083,7 +1298,7 @@ public void processOtherTask(final Task task) { final PingCommand ping = serverResource.getCurrentStatus(getId()); final Request request = new Request(id, -1, ping, false); request.setSequence(getNextSequence()); - logger.debug("Sending ping: {}", request.toString()); + logger.debug("Sending ping: {}", request); try { task.getLink().send(request.toBytes()); @@ -1091,11 +1306,11 @@ public void processOtherTask(final Task task) { updateLastPingResponseTime(); } catch (final ClosedChannelException e) { logger.warn("Unable to send request to {} due to '{}', request: {}", - getLinkLog(task.getLink()), e.getMessage(), request); + task.getLink(), e.getMessage(), request); } } else if (obj instanceof Request) { - final Request req = (Request)obj; + final Request req = (Request) obj; final Command command = req.getCommand(); if (command.getContextParam("logid") != null) { ThreadContext.put("logcontextid", command.getContextParam("logid")); @@ -1113,11 +1328,11 @@ public void processOtherTask(final Task task) { if (answer != null) { final Response response = new Response(req, answer); - logger.debug("Watch Sent: {}", response.toString()); + logger.debug("Watch Sent: {}", response); try { task.getLink().send(response.toBytes()); } catch (final ClosedChannelException e) { - logger.warn("Unable to send response: {}", response.toString()); + logger.warn("Unable to send response: {}", response); } } } else { @@ -1145,7 +1360,7 @@ public void unregisterControlListener(final IAgentControlListener listener) { @Override public AgentControlAnswer sendRequest(final AgentControlCommand cmd, final int timeoutInMilliseconds) throws AgentControlChannelException { - final Request request = new Request(getId(), -1, new Command[] {cmd}, true, false); + final Request request = new Request(getId(), -1, new Command[]{cmd}, true, false); request.setSequence(getNextSequence()); final AgentControlListener listener = new AgentControlListener(request); registerControlListener(listener); @@ -1166,7 +1381,7 @@ public AgentControlAnswer sendRequest(final AgentControlCommand cmd, final int t @Override public void postRequest(final AgentControlCommand cmd) throws AgentControlChannelException { - final Request request = new Request(getId(), -1, new Command[] {cmd}, true, false); + final Request request = new Request(getId(), -1, new Command[]{cmd}, true, false); request.setSequence(getNextSequence()); postRequest(request); } @@ -1176,7 +1391,7 @@ private void postRequest(final Request request) throws AgentControlChannelExcept try { link.send(request.toBytes()); } catch (final ClosedChannelException e) { - logger.warn("Unable to post agent control request: {}", request.toString()); + logger.warn("Unable to post agent control request: {}", request, e); throw new AgentControlChannelException("Unable to post agent control request due to " + e.getMessage()); } } else { @@ -1253,44 +1468,14 @@ public void run() { } } - public class StartupTask implements Runnable { - protected Link link; - private final AtomicBoolean cancelled = new AtomicBoolean(false); - - public StartupTask(final Link link) { - logger.debug("Startup task created"); - this.link = link; - } - - public boolean cancel() { - // TimerTask.cancel may fail depends on the calling context - if (cancelled.compareAndSet(false, true)) { - startupWait = DEFAULT_STARTUP_WAIT; - logger.debug("Startup task cancelled"); - } - return true; - } - - @Override - public void run() { - if (cancelled.compareAndSet(false, true)) { - logger.info("The running startup command is now invalid. Attempting reconnect"); - startupTask.set(null); - startupWait = DEFAULT_STARTUP_WAIT * 2; - logger.debug("Executing reconnect from task - {}", () -> getLinkLog(link)); - reconnect(link); - } - } - } - public class AgentRequestHandler extends Task { public AgentRequestHandler(final Task.Type type, final Link link, final Request req) { super(type, link, req); } @Override - protected void doTask(final Task task) throws TaskExecutionException { - final Request req = (Request)get(); + protected void doTask(final Task task) { + final Request req = (Request) get(); if (!(req instanceof Response)) { processRequest(req, task.getLink()); } @@ -1307,11 +1492,13 @@ public ServerHandler(final Task.Type type, final Link link, final Request req) { } @Override - public void doTask(final Task task) throws TaskExecutionException { + public void doTask(final Task task) { if (task.getType() == Task.Type.CONNECT) { shell.getBackoffAlgorithm().reset(); setLink(task.getLink()); - sendStartup(task.getLink(), shell.isConnectionTransfer()); + Link link = task.getLink(); + link.attach(new ServerAttache(link)); + hostConnectProcess.scheduleConnectProcess(task.getLink(), shell.isConnectionTransfer()); shell.setConnectionTransfer(false); } else if (task.getType() == Task.Type.DATA) { Request request; @@ -1319,9 +1506,10 @@ public void doTask(final Task task) throws TaskExecutionException { request = Request.parse(task.getData()); if (request instanceof Response) { //It's for pinganswer etc, should be processed immediately. - processResponse((Response)request, task.getLink()); + processResponse((Response) request, task.getLink()); } else { - //put the requests from mgt server into another thread pool, as the request may take a longer time to finish. Don't block the NIO main thread pool + //put the requests from mgt server into another thread pool, as the request may take a longer + // time to finish. Don't block the NIO main thread pool requestHandler.submit(new AgentRequestHandler(getType(), getLink(), request)); } } catch (final ClassNotFoundException e) { @@ -1333,13 +1521,13 @@ public void doTask(final Task task) throws TaskExecutionException { try { // an issue has been found if reconnect immediately after disconnecting. // wait 5 seconds before reconnecting - logger.debug("Wait for 5 secs before reconnecting, disconnect task - {}", () -> getLinkLog(task.getLink())); + logger.debug("Wait for 5 secs before reconnecting, disconnect task - {}", task.getLink()); Thread.sleep(5000); } catch (InterruptedException e) { } shell.setConnectionTransfer(false); - logger.debug("Executing disconnect task - {} and reconnecting", () -> getLinkLog(task.getLink())); - reconnect(task.getLink()); + logger.debug("Executing disconnect task - {} and reconnecting", task.getLink()); + requestHandler.submit(() -> reconnect(task.getLink(), null, false)); } else if (task.getType() == Task.Type.OTHER) { processOtherTask(task); } @@ -1402,35 +1590,125 @@ public class PreferredHostCheckerTask extends ManagedContextTimerTask { @Override protected void runInContext() { + logger.debug("Running preferred host checker task"); try { - final String[] msList = shell.getHosts(); - if (msList == null || msList.length < 1) { + String preferredMSHost = Optional.ofNullable(shell.getHosts()) + .filter(ArrayUtils::isNotEmpty) + .map(hosts -> hosts[0]) + .orElse(null); + + if (StringUtils.isBlank(preferredMSHost)) { + logger.debug("Preferred host is not defined, skip task"); + return; + } + + List avoidHosts = Optional.ofNullable(shell.getAvoidHosts()).orElseGet(List::of); + if (avoidHosts.contains(preferredMSHost)) { + logger.debug("Preferred host {} is in avoid list {}, skip task", + preferredMSHost, String.join(", ", avoidHosts)); return; } - final String preferredMSHost = msList[0]; - final String connectedHost = shell.getConnectedHost(); - logger.debug("Running preferred host checker task, connected host={}, preferred host={}", - connectedHost, preferredMSHost); - if (preferredMSHost == null || preferredMSHost.equals(connectedHost) || link == null) { + String connectedHost = shell.getConnectedHost(); + String connectedIp = NetUtils.resolveToIp(connectedHost); + String preferredIp = NetUtils.resolveToIp(preferredMSHost); + if (Objects.equals(connectedIp, preferredIp) || Objects.equals(connectedHost, preferredMSHost)) { + logger.debug(String.format("Already connected to the preferred host (%s), skip task", + preferredMSHost)); return; } - boolean isHostUp = false; - try (final Socket socket = new Socket()) { - socket.connect(new InetSocketAddress(preferredMSHost, shell.getPort()), 5000); + if (link == null) { + logger.debug("Agent is not connected to any host now, skip task"); + return; + } + + if (commandsInProgress.get() > 0) { + logger.debug("There are still {} commands in progress, skip task", commandsInProgress.get()); + return; + } + + if (hostConnectProcess.isInProgress()) { + logger.warn("Unable to reconnect because there is already running Host connect process on {}, skip task", + shell.getConnectedHost()); + return; + } + + logger.debug("Running preferred host checker task, preferred host: {} connected host: {}", + preferredMSHost, connectedHost); + + boolean isHostUp; + try (Socket socket = new Socket()) { + socket.connect(new InetSocketAddress(preferredMSHost, shell.getPort()), HOST_SOCKET_CONNECT_TIMEOUT_MS); isHostUp = true; - } catch (final IOException e) { + } catch (IOException e) { + isHostUp = false; logger.debug("Host: {} is not reachable", preferredMSHost); } - if (isHostUp && link != null && commandsInProgress.get() == 0) { + if (isHostUp && link != null && commandsInProgress.get() == 0 && isLockAvailable()) { if (logger.isDebugEnabled()) { logger.debug("Preferred host {} is found to be reachable, trying to reconnect", preferredMSHost); } shell.resetHostCounter(); + logger.info("Reconnecting to {}", link); reconnect(link, preferredMSHost, false); } } catch (Throwable t) { logger.error("Error caught while attempting to connect to preferred host", t); } } + + /** + * Sends {@link AgentConnectStatusCommand} to the currently connected Management Server + * and checks whether {@link AgentConnectStatusAnswer#isLockAvailable()}. + * + * @return true if lock available + */ + private boolean isLockAvailable() { + boolean lockAvailable = true; + ServerAttache attache = (ServerAttache) link.attachment(); + if (attache != null && attache.getLink() != null) { + AgentConnectStatusCommand command = setupAgentConnectStatusCommand(new AgentConnectStatusCommand()); + Command[] commands = new Command[]{command}; + try { + AgentConnectStatusAnswer answer = attache.send(getId(), commands, AgentConnectStatusAnswer.class, + DEFAULT_ASYNC_COMMAND_TIMEOUT_SEC); + lockAvailable = Optional.ofNullable(answer) + .map(AgentConnectStatusAnswer::isLockAvailable) + .orElse(Boolean.FALSE); + if (!lockAvailable) { + logger.info(String.format("There is lock and Host status is %s, will retry later", + answer.getHostStatus())); + } + } catch (IOException | RuntimeException e) { + String commandName = commands[0].getClass().getSimpleName(); + logger.error(String.format("Failed to retrieve %s, will retry later", commandName), e); + } + } + return lockAvailable; + } + } + + @Override + public int getMaxConcurrentNewConnectionsCount() { + return shell.getWorkers(); + } + + @Override + public int getNewConnectionsCount() { + logger.trace("Get new connections count called"); + return 0; + } + + @Override + public void registerNewConnection(InetSocketAddress address) { + logger.trace("Register new connection to {}", address); + } + + @Override + public void unregisterNewConnection(InetSocketAddress address) { + logger.trace("Unregister new connection to {}", address); + } + + public String getPersistentProperty(String name) { + return shell.getPersistentProperty(null, name); } } diff --git a/agent/src/main/java/com/cloud/agent/AgentShell.java b/agent/src/main/java/com/cloud/agent/AgentShell.java index 4862e7e001e3..2a3374a48c39 100644 --- a/agent/src/main/java/com/cloud/agent/AgentShell.java +++ b/agent/src/main/java/com/cloud/agent/AgentShell.java @@ -32,6 +32,7 @@ import javax.naming.ConfigurationException; +import com.cloud.utils.backoff.BackoffFactory; import org.apache.commons.daemon.Daemon; import org.apache.commons.daemon.DaemonContext; import org.apache.commons.daemon.DaemonInitException; @@ -52,7 +53,6 @@ import com.cloud.utils.ProcessUtil; import com.cloud.utils.PropertiesUtil; import com.cloud.utils.backoff.BackoffAlgorithm; -import com.cloud.utils.backoff.impl.ConstantTimeBackoff; import com.cloud.utils.exception.CloudRuntimeException; public class AgentShell implements IAgentShell, Daemon { @@ -95,6 +95,16 @@ public BackoffAlgorithm getBackoffAlgorithm() { return _backoff; } + @Override + public void setBackoffAlgorithm(BackoffAlgorithm backoffAlgorithm) { + this._backoff = backoffAlgorithm; + try { + backoffAlgorithm.getConfiguration().forEach((key, value) -> setPersistentProperty(null, key, value)); + } catch (RuntimeException e) { + LOGGER.warn("Failed to persist backoff properties"); + } + } + @Override public int getPingRetries() { return _pingRetries; @@ -400,7 +410,7 @@ public void init(String[] args) throws ConfigurationException { final Class c = this.getClass(); _version = c.getPackage().getImplementationVersion(); if (_version == null) { - throw new CloudRuntimeException("Unable to find the implementation version of this agent"); + _version = "unknown"; } LOGGER.info("Implementation Version is {}", _version); @@ -410,7 +420,7 @@ public void init(String[] args) throws ConfigurationException { if (LOGGER.isDebugEnabled()) { List properties = Collections.list((Enumeration)_properties.propertyNames()); for (String property : properties) { - LOGGER.debug("Found property: {}", property); + LOGGER.debug("Found property: {}, value: {}", property, _properties.getProperty(property)); } } @@ -424,11 +434,15 @@ public void init(String[] args) throws ConfigurationException { _properties.put(cmdLineProp.getKey(), cmdLineProp.getValue()); } - LOGGER.info("Defaulting to the constant time backoff algorithm"); - _backoff = new ConstantTimeBackoff(); - Map map = new HashMap<>(); - map.put("seconds", _properties.getProperty("backoff.seconds")); - _backoff.configure("ConstantTimeBackoff", map); + try { + LOGGER.info("Creating backoff delay algorithm implementation"); + setBackoffAlgorithm(BackoffFactory.create(_properties)); + LOGGER.info("Created {} delay algorithm implementation", getBackoffAlgorithm().getClass().getName()); + } catch (RuntimeException e) { + String msg = String.format("Failed to create backoff with provided properties %s, failing back to default", _properties); + LOGGER.warn(msg, e); + setBackoffAlgorithm(BackoffFactory.createDefault(_properties)); + } } private void launchAgent() throws ConfigurationException { @@ -546,7 +560,7 @@ public void start() { } } catch (final Exception e) { - LOGGER.error("Unable to start agent: ", e); + LOGGER.error("Unable to start agent: {}", e.getLocalizedMessage(), e); System.exit(ExitStatus.Error.value()); } } @@ -568,6 +582,7 @@ public static void main(String[] args) { shell.init(args); shell.start(); } catch (ConfigurationException e) { + LOGGER.fatal(e.getMessage(), e); System.out.println(e.getMessage()); } } diff --git a/agent/src/main/java/com/cloud/agent/HostConnectProcess.java b/agent/src/main/java/com/cloud/agent/HostConnectProcess.java new file mode 100644 index 000000000000..b9dfa57f11f4 --- /dev/null +++ b/agent/src/main/java/com/cloud/agent/HostConnectProcess.java @@ -0,0 +1,355 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.agent; + +import com.cloud.agent.api.AgentConnectStatusAnswer; +import com.cloud.agent.api.AgentConnectStatusCommand; +import com.cloud.agent.api.Answer; +import com.cloud.agent.api.Command; +import com.cloud.agent.api.StartupAnswer; +import com.cloud.agent.api.StartupCommand; +import com.cloud.agent.properties.AgentProperties; +import com.cloud.agent.properties.AgentPropertiesFileHandler; +import com.cloud.agent.transport.Request; +import com.cloud.exception.CloudException; +import com.cloud.exception.OperationTimedoutException; +import com.cloud.host.Status; +import com.cloud.resource.ResourceStatusUpdater; +import com.cloud.resource.ServerResource; +import com.cloud.utils.concurrency.NamedThreadFactory; +import com.cloud.utils.nio.Link; +import org.apache.cloudstack.threadcontext.ThreadContextUtil; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.ThreadContext; + +import java.io.IOException; +import java.nio.channels.ClosedChannelException; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.ScheduledFuture; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Predicate; + +public class HostConnectProcess { + private static final Logger logger = LogManager.getLogger(HostConnectProcess.class); + + public static final int DEFAULT_ASYNC_COMMAND_TIMEOUT_SEC = + AgentPropertiesFileHandler.getPropertyValue(AgentProperties.ASYNC_COMMAND_TIMEOUT_SEC); + + public static final int DEFAULT_ASYNC_STARTUP_COMMAND_TIMEOUT_SEC = + AgentPropertiesFileHandler.getPropertyValue(AgentProperties.ASYNC_STARTUP_COMMAND_TIMEOUT_SEC); + + static final long HOST_STATUS_CHECK_INITIAL_DELAY_SEC = 10; + private long hostStatusCheckDelaySec = AgentPropertiesFileHandler.getPropertyValue(AgentProperties.AGENT_HOST_STATUS_CHECK_DELAY_SEC); + private final AtomicReference> hostStatusFutureRef = new AtomicReference<>(); + private final Agent agent; + private ScheduledExecutorService hostStatusExecutor; + + public HostConnectProcess(Agent agent) { + this.agent = agent; + initExecutors(); + } + + private void initExecutors() { + stop(); + var threadFactory = new NamedThreadFactory("Agent-" + HostStatusTask.class.getSimpleName()); + hostStatusExecutor = Executors.newScheduledThreadPool(1, threadFactory); + } + + /** + * Stops the whole connect process and cancels all scheduled asynchronous tasks. + * Returns {@link Boolean#TRUE} if {@link HostConnectProcess} was waiting for {@link StartupAnswer}. + */ + public boolean stop() { + logger.debug("Stopping connect process. The process is active: {}", isInProgress()); + stopHostStatusExecutor(); + logger.debug("Stopped executor"); + Optional> hostStatusOpt = Optional.ofNullable(hostStatusFutureRef.getAndSet(null)) + .filter(Predicate.not(ScheduledFuture::isCancelled)); + + hostStatusOpt.ifPresent(future -> future.cancel(true)); + logger.debug("Cancelled future"); + + return hostStatusOpt.isPresent(); + } + + private void stopHostStatusExecutor() { + if (hostStatusExecutor != null) { + hostStatusExecutor.shutdown(); + hostStatusExecutor = null; + } + } + + public void scheduleConnectProcess(Link link, boolean connectionTransfer) { + logger.debug("Scheduling connect process for {}", link); + initExecutors(); + + var task = new HostStatusTask(link, connectionTransfer, agent, hostStatusFutureRef); + var future = hostStatusExecutor.scheduleWithFixedDelay(ThreadContextUtil.wrapThreadContext(task), + HOST_STATUS_CHECK_INITIAL_DELAY_SEC, + hostStatusCheckDelaySec, TimeUnit.SECONDS); + hostStatusFutureRef.set(future); + } + + /** + * Returns {@link Boolean#TRUE} if {@link HostStatusTask} created and scheduled. + * That means there is already {@link Status#Connecting} process is running. + */ + public boolean isInProgress() { + return Optional.ofNullable(hostStatusFutureRef.get()) + .filter(Predicate.not(ScheduledFuture::isCancelled)).isPresent(); + } + + public void updateHostStatusCheckDelay(int newDelaySec) { + logger.info("Updating host status check delay from {} to {} seconds", hostStatusCheckDelaySec, newDelaySec); + this.hostStatusCheckDelaySec = newDelaySec; + } + + /** + * Task wait for the Host to be available to connect to submit {@link StartupCommand}. + * Checks Host status on Management Server cluster and submit {@link StartupCommand} only if there is no lock and + * Host is not {@link Status#Connecting}. + */ + public static class HostStatusTask implements Runnable, AsyncSend { + private final Set operationalStatuses = Set.of(Status.Connecting, Status.Up, Status.Rebalancing); + + private final Link _link; + private final boolean _forceConnect; + private final Agent _agent; + private final AtomicReference> _futureRef; + + public HostStatusTask(Link link, boolean forceConnect, Agent agent, + AtomicReference> futureRef) { + logger.debug("{} created", this.getClass().getSimpleName()); + _link = link; + _forceConnect = forceConnect; + _agent = agent; + _futureRef = futureRef; + } + + private void cancel() { + logger.debug("Cancelling future"); + Optional.ofNullable(_futureRef.get()) + .filter(Predicate.not(ScheduledFuture::isCancelled)) + .ifPresent(future -> future.cancel(true)); + logger.debug("Cancelled future"); + } + + @Override + public void run() { + try { + logger.debug("Running {}", getClass().getSimpleName()); + runInternal(); + } catch (Exception e) { + logger.error("Failed to run {}", getClass().getSimpleName(), e); + } + } + + private void runInternal() { + ServerAttache attache = (ServerAttache) _link.attachment(); + if (attache == null || attache.getLink() == null) { + cancel(); + return; + } + + AgentConnectStatusAnswer answer; + try { + answer = getAgentConnectStatusAnswer(attache); + } catch (IOException e) { + cancel(); + logger.error("The connection to {} interrupted, restarting the whole process", _link, e); + _agent.getRequestHandler().submit(() -> _agent.reconnect(_link, null, _forceConnect)); + return; + } + if (answer == null) { + logger.warn("Received empty agent connect status answer, will retry later"); + return; + } + Boolean lockAvailable = answer.isLockAvailable(); + Status status = answer.getHostStatus(); + if (Boolean.TRUE.equals(lockAvailable)) { + // send startup command here + logger.info("There is no lock and Host status is {}", status); + try { + sendStartupCommand(_link, _forceConnect); + logger.debug("Sending startup command to {} finished", _link); + cancel(); + logger.debug("Unscheduled {}", getClass().getSimpleName()); + } catch (RuntimeException e) { + logger.error("Failed to send startup command to {}", _link, e); + } catch (IOException e) { + cancel(); + logger.error("The connection to {} interrupted, restarting the whole process", _link, e); + _agent.getRequestHandler().submit(() -> _agent.reconnect(_link, null, _forceConnect)); + } + } else { + logger.info("There is lock and Host status is {}, will retry later", status); + } + } + + private AgentConnectStatusAnswer getAgentConnectStatusAnswer(ServerAttache attache) throws IOException { + AgentConnectStatusCommand command = _agent.setupAgentConnectStatusCommand(new AgentConnectStatusCommand()); + var commands = new Command[]{command}; + try { + return send(attache, commands, AgentConnectStatusAnswer.class, DEFAULT_ASYNC_COMMAND_TIMEOUT_SEC); + } catch (RuntimeException e) { + String commandName = commands[0].getClass().getSimpleName(); + logger.error("Failed to retrieve {}, will retry later", commandName, e); + return null; + } + } + + public void sendStartupCommand(Link link, boolean connectionTransfer) throws IOException { + ServerAttache attache = (ServerAttache) link.attachment(); + if (attache == null || attache.getLink() == null) { + return; + } + ServerResource serverResource = _agent.getResource(); + StartupCommand[] startup = serverResource.initialize(); + if (ArrayUtils.isEmpty(startup)) { + logger.warn("No startup commands returned from {}, Startup command sending skipped", serverResource.getName()); + return; + } + String logId = Optional.ofNullable(ThreadContext.get("logcontextid")) + .map(String.class::cast).orElse(null); + String msHostList = _agent.getPersistentProperty("host"); + // need to downcast StartupCommand[] to Command[], otherwise logger will fail to decode JSON on MS side + Command[] commands = new Command[startup.length]; + for (int i = 0; i < startup.length; i++) { + StartupCommand command = startup[i]; + commands[i] = command; + _agent.setupStartupCommand(command); + command.setMSHostList(msHostList); + command.setConnectionTransferred(connectionTransfer); + if (logId != null) { + command.setContextParam("logid", logId); + } + } + String commandName = commands[0].getClass().getSimpleName(); + boolean needAdditionalValidation = false; + try { + logger.debug("Sending command {} to {}", commandName, link); + var answer = send(attache, commands, StartupAnswer.class, DEFAULT_ASYNC_STARTUP_COMMAND_TIMEOUT_SEC); + logger.info("Received answer for {} from {}: {}, cancelling", commandName, link, answer); + } catch (RuntimeException e) { + Throwable cause = e.getCause(); + if (cause instanceof OperationTimedoutException) { + String msg = String.format("Failed to retrieve answer for the command %s, sent to %s", + commandName, link); + logger.error(msg, e); + needAdditionalValidation = true; + } else { + String msg = String.format("Failed to send command %s to %s", commandName, link); + logger.error(msg, e); + throw new IllegalStateException(msg, e); + } + } + if (needAdditionalValidation) { + AgentConnectStatusAnswer answer = getAgentConnectStatusAnswer(attache); + if (answer == null) { + logger.warn("Received empty agent connect status answer, reconnecting"); + _agent.getRequestHandler().submit(() -> _agent.reconnect(link, null, connectionTransfer)); + return; + } + Boolean lockAvailable = answer.isLockAvailable(); + Status status = answer.getHostStatus(); + if (Boolean.TRUE.equals(lockAvailable) && status != null && operationalStatuses.contains(status)) { + logger.info("Host is in operational state {} on {}", status, _link); + } else if (Boolean.FALSE.equals(lockAvailable)) { + logger.info("Host is locked and has state {} on {}", status, _link); + } else { + logger.info("Host is locked and has state {} on {}, reconnecting", status, _link); + _agent.getRequestHandler().submit(() -> _agent.reconnect(link, null, connectionTransfer)); + return; + } + } + if (serverResource instanceof ResourceStatusUpdater) { + ((ResourceStatusUpdater) serverResource).registerStatusUpdater(_agent); + } + + } + + public Agent getAgent() { + return _agent; + } + } + + interface AsyncSend { + Agent getAgent(); + + default T send(ServerAttache attache, Command[] commands, Class answerType, + int asyncCommandTimeoutSec) throws IOException { + String logId = Optional.ofNullable(ThreadContext.get("logcontextid")) + .filter(String.class::isInstance) + .map(String.class::cast) + .orElse(null); + if (logId != null) { + for (Command command : commands) { + if (command.getContextParam("logid") == null) { + command.setContextParam("logid", logId); + } + } + } + Link link = attache.getLink(); + String commandName = commands[0].getClass().getSimpleName(); + Long id = Optional.ofNullable(getAgent().getId()).orElse(-1L); + Request request = new Request(id, -1, commands, true, false); + Answer[] answers; + try { + answers = attache.send(request, asyncCommandTimeoutSec); + } catch (OperationTimedoutException e) { + String msg = String.format("Failed to retrieve answer for command %s to %s", commandName, link); + logger.error(msg, e); + throw new RuntimeException(msg, e); + } catch (CloudException e) { + String msg = String.format("Failed to send command %s to %s", commandName, link); + logger.error(msg, e); + Throwable cause = e.getCause(); + if (cause instanceof ClosedChannelException) { + throw (ClosedChannelException) cause; + } + throw new RuntimeException(msg, e); + } + if (ArrayUtils.isEmpty(answers)) { + String msg = String.format("Received empty %s response from %s", commandName, link); + logger.warn(msg); + throw new IllegalArgumentException(msg); + } + Answer answer = answers[0]; + String details = answer.getDetails(); + if (!answer.getResult()) { + String msg = String.format("Received unsuccessful %s response status from %s: %s", commandName, link, + details); + logger.warn(msg); + throw new IllegalArgumentException(msg); + } + String answerName = answer.getClass().getSimpleName(); + if (!answerType.isInstance(answer)) { + String msg = String.format("Received unexpected %s response type %s from %s: %s", commandName, + answerName, link, details); + logger.warn(msg); + throw new IllegalArgumentException(msg); + } + return answerType.cast(answer); + } + } +} diff --git a/agent/src/main/java/com/cloud/agent/IAgentShell.java b/agent/src/main/java/com/cloud/agent/IAgentShell.java index 9eefa6d2eeee..6405f1b570be 100644 --- a/agent/src/main/java/com/cloud/agent/IAgentShell.java +++ b/agent/src/main/java/com/cloud/agent/IAgentShell.java @@ -54,6 +54,8 @@ public interface IAgentShell { BackoffAlgorithm getBackoffAlgorithm(); + void setBackoffAlgorithm(BackoffAlgorithm backoffAlgorithm); + int getPingRetries(); String getVersion(); diff --git a/agent/src/main/java/com/cloud/agent/ServerAttache.java b/agent/src/main/java/com/cloud/agent/ServerAttache.java new file mode 100644 index 000000000000..537d66cc76d6 --- /dev/null +++ b/agent/src/main/java/com/cloud/agent/ServerAttache.java @@ -0,0 +1,476 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.agent; + +import com.cloud.agent.api.Answer; +import com.cloud.agent.api.Command; +import com.cloud.agent.transport.Request; +import com.cloud.agent.transport.Response; +import com.cloud.exception.AgentUnavailableException; +import com.cloud.exception.CloudException; +import com.cloud.exception.OperationTimedoutException; +import com.cloud.utils.concurrency.NamedThreadFactory; +import com.cloud.utils.nio.Link; +import org.apache.cloudstack.managed.context.ManagedContextRunnable; +import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils; +import org.apache.commons.lang3.ArrayUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.ThreadContext; + +import java.io.IOException; +import java.nio.channels.ClosedChannelException; +import java.security.SecureRandom; +import java.util.Collections; +import java.util.Comparator; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.Map; +import java.util.Optional; +import java.util.Random; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +/** + * ServerAttache provides basic server communication commands to be implemented. + * + * @author mprokopchuk + */ +public class ServerAttache { + private static final Logger logger = LogManager.getLogger(ServerAttache.class); + + private static final ScheduledExecutorService s_listenerExecutor = Executors.newScheduledThreadPool(10, + new NamedThreadFactory("ListenerTimer")); + + protected static Comparator s_reqComparator = (o1, o2) -> { + long seq1 = o1.getSequence(); + long seq2 = o2.getSequence(); + if (seq1 < seq2) { + return -1; + } else if (seq1 > seq2) { + return 1; + } else { + return 0; + } + }; + + protected static Comparator s_seqComparator = (o1, o2) -> { + long seq1 = ((Request) o1).getSequence(); + long seq2 = (Long) o2; + if (seq1 < seq2) { + return -1; + } else if (seq1 > seq2) { + return 1; + } else { + return 0; + } + }; + + private static final Random s_rand = new SecureRandom(); + protected String _name; + private Link _link; + protected ConcurrentHashMap _waitForList; + protected LinkedList _requests; + protected Long _currentSequence; + protected long _nextSequence; + + protected ServerAttache(Link link) { + _name = link.getIpAddress(); + _link = link; + _waitForList = new ConcurrentHashMap<>(); + _requests = new LinkedList<>(); + _nextSequence = Long.valueOf(s_rand.nextInt(Short.MAX_VALUE)) << 48; + } + + @Override + public String toString() { + return String.format("ServerAttache %s", ReflectionToStringBuilderUtils.reflectOnlySelectedFields(this, + "_name")); + } + + public synchronized long getNextSequence() { + return ++_nextSequence; + } + + protected synchronized void addRequest(Request req) { + int index = findRequest(req); + assert (index < 0) : "How can we get index again? " + index + ":" + req.toString(); + _requests.add(-index - 1, req); + } + + protected void cancel(Request req) { + cancel(req.getSequence()); + } + + protected synchronized void cancel(long seq) { + logger.debug(log(seq, "Cancelling.")); + + ServerListener listener = _waitForList.remove(seq); + if (listener != null) { + listener.processDisconnect(); + } + int index = findRequest(seq); + if (index >= 0) { + _requests.remove(index); + } + } + + protected synchronized int findRequest(Request req) { + return Collections.binarySearch(_requests, req, s_reqComparator); + } + + protected synchronized int findRequest(long seq) { + return Collections.binarySearch(_requests, seq, s_seqComparator); + } + + protected String log(long seq, String msg) { + return "Seq " + _name + "-" + seq + ": " + msg; + } + + protected void registerListener(long seq, ServerListener listener) { + if (logger.isTraceEnabled()) { + logger.trace(log(seq, "Registering listener")); + } + if (listener.getTimeout() != -1) { + s_listenerExecutor.schedule(new Alarm(seq), listener.getTimeout(), TimeUnit.SECONDS); + } + _waitForList.put(seq, listener); + } + + protected ServerListener unregisterListener(long sequence) { + if (logger.isTraceEnabled()) { + logger.trace(log(sequence, "Unregistering listener")); + } + return _waitForList.remove(sequence); + } + + protected ServerListener getListener(long sequence) { + return _waitForList.get(sequence); + } + + public String getName() { + return _name; + } + + public int getQueueSize() { + return _requests.size(); + } + + public boolean processAnswers(long seq, Response resp) { + resp.logD("Processing: ", true); + boolean processed = false; + Answer[] answers = resp.getAnswers(); + try { + ServerListener monitor = getListener(seq); + if (monitor == null) { + if (answers[0] != null && answers[0].getResult()) { + processed = true; + } + logger.debug(log(seq, "Unable to find listener.")); + } else { + processed = monitor.processAnswers(seq, answers); + logger.trace(log(seq, (processed ? "" : " did not ") + " processed ")); + if (!monitor.isRecurring()) { + unregisterListener(seq); + } + } + } finally { + // we should always trigger next command execution, even in failure cases - otherwise in exception case + // all the remaining will be stuck in the sync queue forever + if (resp.executeInSequence()) { + sendNext(seq); + } + } + return processed; + } + + protected void cancelAllCommands() { + for (Iterator> it = _waitForList.entrySet().iterator(); it.hasNext(); ) { + Map.Entry entry = it.next(); + it.remove(); + + ServerListener monitor = entry.getValue(); + logger.debug(log(entry.getKey(), "Sending disconnect to " + monitor.getClass())); + monitor.processDisconnect(); + } + } + + public void cleanup() { + cancelAllCommands(); + _requests.clear(); + } + + @Override + public boolean equals(Object obj) { + // Return false straight away. + if (obj == null) { + return false; + } + // No need to handle a ClassCastException. If the classes are different, then equals can return false + // straight ahead. + if (this.getClass() != obj.getClass()) { + return false; + } + ServerAttache that = (ServerAttache) obj; + return _name.equals(that.getName()); + } + + public void send(Request req, ServerListener listener) throws CloudException { + long seq = req.getSequence(); + if (listener != null) { + registerListener(seq, listener); + } + + synchronized (this) { + try { + if (isClosed()) { + throw new CloudException("The link to the server " + _name + " has been closed", new ClosedChannelException()); + } + + if (req.executeInSequence() && _currentSequence != null) { + req.logD("Waiting for Seq " + _currentSequence + " Scheduling: ", true); + addRequest(req); + return; + } + + // If we got to here either we're not supposed to set the _currentSequence or it is null already. + req.logD("Sending ", true); + send(req); + + if (req.executeInSequence() && _currentSequence == null) { + _currentSequence = seq; + logger.trace(log(seq, " is current sequence")); + } + } catch (CloudException e) { + logger.info(log(seq, "Unable to send due to " + e.getMessage()), e); + cancel(seq); + throw e; + } catch (Exception e) { + logger.warn(log(seq, "Unable to send due to " + e.getMessage()), e); + cancel(seq); + throw new CloudException("Problem due to other exception " + e.getMessage(), e); + } + } + } + + public Answer[] send(Request req, int wait) throws CloudException { + if (req.getSequence() <= 0) { + req.setSequence(getNextSequence()); + } + SynchronousListener sl = new SynchronousListener(); + sl.setTimeout(wait + 5); + send(req, sl); + + long seq = req.getSequence(); + try { + for (int i = 0; i < 2; i++) { + Answer[] answers = null; + try { + answers = sl.waitFor(wait); + } catch (InterruptedException e) { + logger.debug(log(seq, "Interrupted")); + } + if (answers != null) { + new Response(req, answers).logD("Received: ", false); + return answers; + } + + answers = sl.getAnswers(); // Try it again. + if (answers != null) { + new Response(req, answers).logD("Received after timeout: ", true); + return answers; + } + + Long current = _currentSequence; + if (current != null && seq != current) { + if (logger.isDebugEnabled()) { + logger.debug(log(seq, "Waited too long.")); + } + throw new OperationTimedoutException(req.getCommands(), -1, seq, wait, false); + } + logger.debug(log(seq, "Waiting some more time because this is the current command")); + } + throw new OperationTimedoutException(req.getCommands(), -1, seq, wait * 2, true); + } catch (OperationTimedoutException e) { + logger.warn(log(seq, "Timed out on " + req)); + cancel(seq); + Long current = _currentSequence; + if (req.executeInSequence() && (current != null && current == seq)) { + sendNext(seq); + } + throw e; + } catch (Exception e) { + logger.warn(log(seq, "Exception while waiting for answer"), e); + cancel(seq); + Long current = _currentSequence; + if (req.executeInSequence() && (current != null && current == seq)) { + sendNext(seq); + } + throw new OperationTimedoutException(req.getCommands(), -1, seq, wait, false); + } finally { + unregisterListener(seq); + } + } + + protected synchronized void sendNext(long seq) { + _currentSequence = null; + if (_requests.isEmpty()) { + if (logger.isDebugEnabled()) { + logger.debug(log(seq, "No more commands found")); + } + return; + } + + Request req = _requests.pop(); + if (logger.isDebugEnabled()) { + logger.debug(log(req.getSequence(), "Sending now. is current sequence.")); + } + try { + send(req); + } catch (CloudException e) { + logger.debug(log(req.getSequence(), "Unable to send the next sequence")); + cancel(req.getSequence()); + } + _currentSequence = req.getSequence(); + } + + public void process(Answer[] answers) { + //do nothing + } + + /** + * sends the request asynchronously. + * + * @param req + * @throws AgentUnavailableException + */ + public synchronized void send(final Request req) throws CloudException { + try { + _link.send(req.toBytes()); + } catch (ClosedChannelException e) { + throw new CloudException("Channel is closed", e); + } + } + + /** + * Process disconnect. + */ + public void disconnect() { + synchronized (this) { + logger.debug("Processing Disconnect."); + if (_link != null) { + _link.close(); + _link.terminated(); + } + _link = null; + } + cancelAllCommands(); + _requests.clear(); + } + + /** + * Is the agent closed for more commands? + * + * @return true if unable to reach agent or false if reachable. + */ + protected boolean isClosed() { + return _link == null; + } + + public Link getLink() { + return _link; + } + + public T send(Long agentId, Command[] commands, Class answerType, int asyncCommandTimeoutSec) throws IOException { + String logId = Optional.ofNullable(ThreadContext.get("logcontextid")) + .filter(String.class::isInstance) + .map(String.class::cast) + .orElse(null); + if (logId != null) { + for (Command command : commands) { + if (command.getContextParam("logid") == null) { + command.setContextParam("logid", logId); + } + } + } + Link link = getLink(); + String commandName = commands[0].getClass().getSimpleName(); + + Long id = Optional.ofNullable(agentId).orElse(-1L); + Request request = new Request(id, -1, commands, true, false); + Answer[] answers; + try { + answers = send(request, asyncCommandTimeoutSec); + } catch (OperationTimedoutException e) { + String msg = String.format("Failed to retrieve answer for command %s to %s", commandName, link); + logger.error(msg, e); + throw new RuntimeException(msg, e); + } catch (CloudException e) { + String msg = String.format("Failed to send command %s to %s", commandName, link); + logger.error(msg, e); + Throwable cause = e.getCause(); + if (cause instanceof ClosedChannelException) { + throw (ClosedChannelException) cause; + } + throw new RuntimeException(msg, e); + } + if (ArrayUtils.isEmpty(answers)) { + String msg = String.format("Received empty %s response from %s", commandName, link); + logger.warn(msg); + throw new IllegalArgumentException(msg); + } + Answer answer = answers[0]; + String details = answer.getDetails(); + if (!answer.getResult()) { + String msg = String.format("Received unsuccessful %s response status from %s: %s", commandName, link, + details); + logger.warn(msg); + throw new IllegalArgumentException(msg); + } + String answerName = answer.getClass().getSimpleName(); + if (!answerType.isInstance(answer)) { + String msg = String.format("Received unexpected %s response type %s from %s: %s", commandName, + answerName, link, details); + logger.warn(msg); + throw new IllegalArgumentException(msg); + } + return answerType.cast(answer); + } + + protected class Alarm extends ManagedContextRunnable { + long _seq; + + public Alarm(long seq) { + _seq = seq; + } + + @Override + protected void runInContext() { + try { + ServerListener listener = unregisterListener(_seq); + if (listener != null) { + cancel(_seq); + listener.processTimeout(_seq); + } + } catch (Exception e) { + ServerAttache.logger.warn("Exception ", e); + } + } + } +} diff --git a/agent/src/main/java/com/cloud/agent/ServerListener.java b/agent/src/main/java/com/cloud/agent/ServerListener.java new file mode 100644 index 000000000000..569dfade9772 --- /dev/null +++ b/agent/src/main/java/com/cloud/agent/ServerListener.java @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.agent; + +import com.cloud.agent.api.Answer; +import com.cloud.agent.api.Command; + +/** + * There are several types of events that the Agent forwards. + *

+ * 1. Commands sent by the Agent. + * 2. Answers sent by the Management Server. + */ +public interface ServerListener { + + /** + * @param seq sequence number return by the send() method. + * @param answers answers to the commands. + * @return true if processed. false if not. + */ + boolean processAnswers(long seq, Answer[] answers); + + /** + * This method is called by the ServerHandler when Management Server sent + * a command to the server. + * In order to process these commands, the Server Attache Listener must be registered for commands. + * + * @param seq sequence number of the command sent. + * @param commands commands that were sent. + * @return true if you processed the commands. false if not. + */ + boolean processCommands(long seq, Command[] commands); + + /** + * This method is called by ServerHandler when an agent disconnects + * from the Management Server if the listener has been registered for host events. + *

+ * If the Listener is passed to the send() method, this method is + * also called by ServerHandler if the agent disconnected. + */ + boolean processDisconnect(); + + /** + * If this Listener is passed to the send() method, this method + * is called by ServerHandler after processing an answer + * from the agent. + * Returning true means you're expecting more + * answers from the agent using the same sequence number. + * + * @return true if expecting more answers using the same sequence number. + */ + boolean isRecurring(); + + /** + * If the Listener is passed to the send() method, this method is + * called to determine how long to wait for the reply. The value + * is in seconds. -1 indicates to wait forever. 0 indicates to + * use the default timeout. If the timeout is + * reached, processTimeout on this same Listener is called. + * + * @return timeout in seconds before processTimeout is called. + */ + default int getTimeout() { + return -1; + } + + /** + * If the Listener is passed to the send() method, this method is + * called by the ServerHandler to process a command timeout. + * + * @param seq sequence number returned by send(). + * @return true if processed; false if not. + */ + default boolean processTimeout(long seq) { + return false; + } +} diff --git a/agent/src/main/java/com/cloud/agent/SynchronousListener.java b/agent/src/main/java/com/cloud/agent/SynchronousListener.java new file mode 100644 index 000000000000..7704eb8a341c --- /dev/null +++ b/agent/src/main/java/com/cloud/agent/SynchronousListener.java @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.agent; + +import com.cloud.agent.api.Answer; +import com.cloud.agent.api.Command; +import com.cloud.utils.Profiler; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +/** + * Blocks the thread and waits for the answer. + * + * @author mprokopchuk + */ +public class SynchronousListener implements ServerListener { + private static final Logger logger = LogManager.getLogger(SynchronousListener.class); + + protected Answer[] _answers; + protected boolean _disconnected; + private int _timeout; + + public synchronized Answer[] getAnswers() { + return _answers; + } + + @Override + public boolean isRecurring() { + return false; + } + + public synchronized boolean isDisconnected() { + return _disconnected; + } + + @Override + public synchronized boolean processAnswers(long seq, Answer[] resp) { + _answers = resp; + notifyAll(); + return true; + } + + @Override + public boolean processCommands(long seq, Command[] commands) { + return false; + } + + @Override + public synchronized boolean processDisconnect() { + logger.trace("Server disconnected. Will notify waiters"); + _disconnected = true; + notifyAll(); + return true; + } + + public synchronized Answer[] waitFor(int s) throws InterruptedException { + if (_disconnected) { + return null; + } + + if (_answers != null) { + return _answers; + } + + Profiler profiler = new Profiler(); + profiler.start(); + if (s <= 0) { + wait(); + } else { + int ms = s * 1000; + wait(ms); + } + profiler.stop(); + + logger.trace("Synchronized command - sending completed, time: {}, answer: {}", + profiler.getDurationInMillis(), _answers != null ? _answers[0].toString() : "null"); + return _answers; + } + + @Override + public int getTimeout() { + return _timeout; + } + + public void setTimeout(int timeout) { + _timeout = timeout; + } +} diff --git a/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java b/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java index b0926b23a1eb..199ec5dde2b4 100644 --- a/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java +++ b/agent/src/main/java/com/cloud/agent/properties/AgentProperties.java @@ -864,10 +864,31 @@ public Property getWorkers() { /** * Timeout for SSL handshake in seconds * Data type: Integer.
- * Default value: null + * Default value: 30 */ public static final Property SSL_HANDSHAKE_TIMEOUT = new Property<>("ssl.handshake.timeout", 30, Integer.class); + /** + * Timeout in seconds for asynchronous command to be sent and response received, see {@code wait} configuration key in Management Server. + * Data type: Integer. + * Default value: 180 + */ + public static final Property ASYNC_COMMAND_TIMEOUT_SEC = new Property<>("async.command.timeout.sec", 180, Integer.class); + + /** + * Timeout in seconds for asynchronous startup command to be sent and response received, see {@code wait} configuration key in Management Server. + * Data type: Integer. + * Default value: 300 + */ + public static final Property ASYNC_STARTUP_COMMAND_TIMEOUT_SEC = new Property<>("async.startup.command.timeout.sec", 300, Integer.class); + + /** + * Delay in seconds between host status checks on the agent side. + * Data type: Integer. + * Default value: 15 + */ + public static final Property AGENT_HOST_STATUS_CHECK_DELAY_SEC = new Property<>("agent.host.status.check.delay.sec", 15, Integer.class); + /** * Timeout (in seconds) to wait for the incremental snapshot to complete. * */ diff --git a/agent/src/test/java/com/cloud/agent/AgentTest.java b/agent/src/test/java/com/cloud/agent/AgentTest.java index 65dc030ebd76..0932fdc82f98 100644 --- a/agent/src/test/java/com/cloud/agent/AgentTest.java +++ b/agent/src/test/java/com/cloud/agent/AgentTest.java @@ -21,6 +21,7 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertSame; import static org.junit.Assert.assertTrue; +import static org.mockito.ArgumentMatchers.isA; import static org.mockito.Mockito.any; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.doThrow; @@ -31,6 +32,8 @@ import static org.mockito.Mockito.when; import java.io.IOException; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; import java.net.InetSocketAddress; import javax.naming.ConfigurationException; @@ -67,21 +70,18 @@ public void setUp() throws ConfigurationException { } @Test - public void testGetLinkLogNullLinkReturnsEmptyString() { - Link link = null; - String result = agent.getLinkLog(link); - assertEquals("", result); - } - - @Test - public void testGetLinkLogLinkWithTraceEnabledReturnsLinkLogWithHashCode() { - Link link = mock(Link.class); - InetSocketAddress socketAddress = new InetSocketAddress("192.168.1.100", 1111); - when(link.getSocketAddress()).thenReturn(socketAddress); + public void testGetLinkLogLinkWithTraceEnabledReturnsLinkLogWithHashCode() throws ReflectiveOperationException { + Link link = new Link(new InetSocketAddress("192.168.1.100", 8250), null); + // fix LOGGER access to inject mock + Field field = link.getClass().getDeclaredField("LOGGER"); + field.setAccessible(true); + Field modifiersField = Field.class.getDeclaredField("modifiers"); + modifiersField.setAccessible(true); + modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL); + field.set(null, logger); when(logger.isTraceEnabled()).thenReturn(true); - String result = agent.getLinkLog(link); - System.out.println(result); + String result = link.toString(); assertTrue(result.startsWith(System.identityHashCode(link) + "-")); assertTrue(result.contains("192.168.1.100")); } @@ -115,7 +115,6 @@ public void testAgentInitialization() { when(shell.getPingRetries()).thenReturn(3); when(shell.getWorkers()).thenReturn(5); agent.setupShutdownHookAndInitExecutors(); - assertNotNull(agent.selfTaskExecutor); assertNotNull(agent.outRequestHandler); assertNotNull(agent.requestHandler); } @@ -211,7 +210,10 @@ public void testCloseAndTerminateLinkLinkIsNullDoesNothing() { @Test public void testCloseAndTerminateLinkValidLinkCallsCloseAndTerminate() { Link mockLink = mock(Link.class); + ServerAttache attache = new ServerAttache(mockLink); + when(mockLink.attachment()).thenReturn(attache); agent.closeAndTerminateLink(mockLink); + verify(mockLink).attachment(); verify(mockLink).close(); verify(mockLink).terminated(); } @@ -219,14 +221,14 @@ public void testCloseAndTerminateLinkValidLinkCallsCloseAndTerminate() { @Test public void testStopAndCleanupConnectionConnectionIsNullDoesNothing() { agent.connection = null; - agent.stopAndCleanupConnection(false); + agent.stopAndCleanupConnection(); } @Test public void testStopAndCleanupConnectionValidConnectionNoWaitStopsAndCleansUp() throws IOException { NioConnection mockConnection = mock(NioConnection.class); agent.connection = mockConnection; - agent.stopAndCleanupConnection(false); + agent.stopAndCleanupConnection(); verify(mockConnection).stop(); verify(mockConnection).cleanUp(); } @@ -236,9 +238,9 @@ public void testStopAndCleanupConnectionCleanupThrowsIOExceptionLogsWarning() th NioConnection mockConnection = mock(NioConnection.class); agent.connection = mockConnection; doThrow(new IOException("Cleanup failed")).when(mockConnection).cleanUp(); - agent.stopAndCleanupConnection(false); + agent.stopAndCleanupConnection(); verify(mockConnection).stop(); - verify(logger).warn(eq("Fail to clean up old connection. {}"), any(IOException.class)); + verify(logger).warn(eq("Fail to clean up old connection"), isA(IOException.class)); } @Test @@ -249,9 +251,47 @@ public void testStopAndCleanupConnectionValidConnectionWaitForStopWaitsForStartu agent.connection = mockConnection; when(shell.getBackoffAlgorithm()).thenReturn(mockBackoff); when(mockConnection.isStartup()).thenReturn(true, true, false); - agent.stopAndCleanupConnection(true); - verify(mockConnection).stop(); + agent.stopAndCleanupConnection(); + verify(mockConnection, times(3)).stop(); verify(mockConnection).cleanUp(); - verify(mockBackoff, times(3)).waitBeforeRetry(); + verify(mockBackoff, times(2)).waitBeforeRetry(); + } + + @Test + public void testSelectReconnectionHostWithPreferredHost() { + Link mockLink = mock(Link.class); + // No need to stub mockLink or shell since preferred host short-circuits + String result = agent.selectReconnectionHost("preferred.host.com", mockLink); + assertEquals("preferred.host.com", result); + verify(shell, times(0)).getNextHost(); + } + + @Test + public void testSelectReconnectionHostWithNullPreferredHostUsesLinkAddress() { + Link mockLink = mock(Link.class); + InetSocketAddress socketAddress = new InetSocketAddress("192.168.1.100", 8080); + when(mockLink.getSocketAddress()).thenReturn(socketAddress); + // No need to stub shell.getNextHost() since link address is used + String result = agent.selectReconnectionHost(null, mockLink); + assertEquals("192.168.1.100", result); + verify(shell, times(0)).getNextHost(); + } + + @Test + public void testSelectReconnectionHostWithNullLinkUsesShellNextHost() { + when(shell.getNextHost()).thenReturn("fallback.host.com"); + String result = agent.selectReconnectionHost(null, null); + assertEquals("fallback.host.com", result); + verify(shell, times(1)).getNextHost(); + } + + @Test + public void testSelectReconnectionHostWithNullSocketAddressUsesShellNextHost() { + Link mockLink = mock(Link.class); + when(mockLink.getSocketAddress()).thenReturn(null); + when(shell.getNextHost()).thenReturn("fallback.host.com"); + String result = agent.selectReconnectionHost(null, mockLink); + assertEquals("fallback.host.com", result); + verify(shell, times(1)).getNextHost(); } } diff --git a/agent/src/test/java/com/cloud/agent/HostConnectProcessTest.java b/agent/src/test/java/com/cloud/agent/HostConnectProcessTest.java new file mode 100644 index 000000000000..5be4ca66b596 --- /dev/null +++ b/agent/src/test/java/com/cloud/agent/HostConnectProcessTest.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.agent; + +import com.cloud.exception.CloudException; +import com.cloud.utils.nio.Link; +import org.apache.logging.log4j.Logger; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mockito.junit.MockitoJUnitRunner; +import org.springframework.test.util.ReflectionTestUtils; + +import javax.naming.ConfigurationException; + +import static org.mockito.Mockito.mock; + +@RunWith(MockitoJUnitRunner.class) +public class HostConnectProcessTest { + + private Agent agent; + private Logger logger; + private Link link; + private ServerAttache attache; + private HostConnectProcess hostConnectProcess; + private boolean connectionTransfer; + + @Before + public void setUp() throws ConfigurationException { + agent = mock(Agent.class); + logger = mock(Logger.class); + link = mock(Link.class); + attache = mock(ServerAttache.class); + hostConnectProcess = new HostConnectProcess(agent); + ReflectionTestUtils.setField(agent, "logger", logger); + } + + @Test + public void testScheduleConnectProcess() throws InterruptedException, CloudException { + + hostConnectProcess.scheduleConnectProcess(link, connectionTransfer); + Assert.assertTrue(hostConnectProcess.isInProgress()); + } +} diff --git a/core/src/main/java/com/cloud/agent/api/AgentConnectStatusAnswer.java b/core/src/main/java/com/cloud/agent/api/AgentConnectStatusAnswer.java new file mode 100644 index 000000000000..e7c9e9acf461 --- /dev/null +++ b/core/src/main/java/com/cloud/agent/api/AgentConnectStatusAnswer.java @@ -0,0 +1,65 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// + +package com.cloud.agent.api; + +import com.cloud.host.HostVO; +import com.cloud.host.Status; +import com.cloud.utils.db.GlobalLock; + +/** + * Answer for {@link AgentConnectStatusCommand}. + * + * @author mprokopchuk + */ +public class AgentConnectStatusAnswer extends Answer { + + /** + * {@link Boolean#TRUE} means host has {@link GlobalLock#lock(int)} acquired, otherwise {@link Boolean#FALSE}, + * and null if there is an error during executing {@link AgentConnectStatusCommand}. + */ + private Boolean lockAvailable; + /** + * Represents value of {@link HostVO#getStatus()}. + */ + private Status hostStatus; + + public AgentConnectStatusAnswer() { + } + + public AgentConnectStatusAnswer(Command command, boolean success, String details) { + super(command, success, details); + } + + public Boolean isLockAvailable() { + return lockAvailable; + } + + public void setLockAvailable(Boolean lockAvailable) { + this.lockAvailable = lockAvailable; + } + + public Status getHostStatus() { + return hostStatus; + } + + public void setHostStatus(Status hostStatus) { + this.hostStatus = hostStatus; + } +} diff --git a/core/src/main/java/com/cloud/agent/api/AgentConnectStatusCommand.java b/core/src/main/java/com/cloud/agent/api/AgentConnectStatusCommand.java new file mode 100644 index 000000000000..6eb2ba8b1355 --- /dev/null +++ b/core/src/main/java/com/cloud/agent/api/AgentConnectStatusCommand.java @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.agent.api; + +/** + * Command to check status of {@link StartupCommand} from the Agent. + * + * @author mprokopchuk + */ +public class AgentConnectStatusCommand extends Command { + + private String hostGuid; + private String hostName; + private Long hostId; + + public Long getHostId() { + return hostId; + } + + public void setHostId(Long hostId) { + this.hostId = hostId; + } + + public String getHostGuid() { + return hostGuid; + } + + public void setHostGuid(String hostGuid) { + this.hostGuid = hostGuid; + } + + public String getHostName() { + return hostName; + } + + public void setHostName(String hostName) { + this.hostName = hostName; + } + + @Override + public boolean executeInSequence() { + return false; + } +} diff --git a/core/src/main/java/com/cloud/agent/api/StartupAnswer.java b/core/src/main/java/com/cloud/agent/api/StartupAnswer.java index c619ce75ace6..aa87d2d43690 100644 --- a/core/src/main/java/com/cloud/agent/api/StartupAnswer.java +++ b/core/src/main/java/com/cloud/agent/api/StartupAnswer.java @@ -19,13 +19,20 @@ package com.cloud.agent.api; +import java.util.HashMap; +import java.util.Map; + public class StartupAnswer extends Answer { long hostId; String hostName; String hostUuid; int pingInterval; + Integer agentHostStatusCheckDelaySec; + private Map params; + protected StartupAnswer() { + params = new HashMap<>(); } public StartupAnswer(StartupCommand cmd, long hostId, String hostUuid, String hostName, int pingInterval) { @@ -34,10 +41,12 @@ public StartupAnswer(StartupCommand cmd, long hostId, String hostUuid, String ho this.hostUuid = hostUuid; this.hostName = hostName; this.pingInterval = pingInterval; + params = new HashMap<>(); } public StartupAnswer(StartupCommand cmd, String details) { super(cmd, false, details); + params = new HashMap<>(); } public long getHostId() { @@ -55,4 +64,20 @@ public String getHostName() { public int getPingInterval() { return pingInterval; } + + public Map getParams() { + return params; + } + + public void setParams(Map params) { + this.params = params; + } + + public Integer getAgentHostStatusCheckDelaySec() { + return agentHostStatusCheckDelaySec; + } + + public void setAgentHostStatusCheckDelaySec(Integer agentHostStatusCheckDelaySec) { + this.agentHostStatusCheckDelaySec = agentHostStatusCheckDelaySec; + } } diff --git a/core/src/main/java/com/cloud/agent/transport/Request.java b/core/src/main/java/com/cloud/agent/transport/Request.java index 3769dbbd612c..a8b3d271ecdd 100644 --- a/core/src/main/java/com/cloud/agent/transport/Request.java +++ b/core/src/main/java/com/cloud/agent/transport/Request.java @@ -404,7 +404,7 @@ protected String log(String msg, boolean logContent, Level level) { try { _cmds = s_gson.fromJson(_content, this instanceof Response ? Answer[].class : Command[].class); } catch (RuntimeException e) { - LOGGER.error("Unable to deserialize from json: " + _content); + LOGGER.error("Unable to deserialize from json: " + _content, e); throw e; } } diff --git a/core/src/main/java/com/cloud/resource/ServerResource.java b/core/src/main/java/com/cloud/resource/ServerResource.java index 23d200942a27..6fd80613715b 100644 --- a/core/src/main/java/com/cloud/resource/ServerResource.java +++ b/core/src/main/java/com/cloud/resource/ServerResource.java @@ -84,7 +84,8 @@ default StartupCommand[] initialize(boolean isTransferredConnection) { void setAgentControl(IAgentControl agentControl); default boolean isExitOnFailures() { - return true; + // true would cause unnecessary Agent service restart, don't want it by default + return false; } default boolean isAppendAgentNameToLogs() { diff --git a/core/src/main/java/org/apache/cloudstack/threadcontext/ThreadContextUtil.java b/core/src/main/java/org/apache/cloudstack/threadcontext/ThreadContextUtil.java new file mode 100644 index 000000000000..e0cef747f5c3 --- /dev/null +++ b/core/src/main/java/org/apache/cloudstack/threadcontext/ThreadContextUtil.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.apache.cloudstack.threadcontext; + +import org.apache.logging.log4j.ThreadContext; + +import java.util.HashMap; +import java.util.Map; + +/** + * Utility class, helps to propagate {@link ThreadContext} values from parent to child threads. + * + * @author mprokopchuk + */ +public class ThreadContextUtil { + /** + * Wrap {@link Runnable} to propagate {@link ThreadContext} values. + * + * @param delegate + * @return + */ + public static Runnable wrapThreadContext(Runnable delegate) { + @SuppressWarnings("unchecked") + Map context = ThreadContext.getContext() != null ? + new HashMap<>(ThreadContext.getContext()) : null; + + return () -> { + @SuppressWarnings("unchecked") + Map oldContext = ThreadContext.getContext() != null ? + new HashMap<>(ThreadContext.getContext()) : null; + try { + ThreadContext.clearMap(); + if (context != null) { + context.forEach(ThreadContext::put); + } + delegate.run(); + } finally { + ThreadContext.clearMap(); + if (oldContext != null) { + oldContext.forEach(ThreadContext::put); + } + } + }; + } +} diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentAttache.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentAttache.java index 0a6bbc876544..c8c62ad70ef5 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentAttache.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentAttache.java @@ -193,7 +193,7 @@ protected void checkAvailability(final Command[] cmds) throws AgentUnavailableEx if (_maintenance) { for (final Command cmd : cmds) { if (Arrays.binarySearch(s_commandsAllowedInMaintenanceMode, cmd.getClass().toString()) < 0 && !cmd.isBypassHostMaintenance()) { - throw new AgentUnavailableException("Unable to send " + cmd.getClass().toString() + " because agent " + _name + " is in maintenance mode", _id); + throw new AgentUnavailableException("Unable to send " + cmd.getClass() + " because agent " + _name + " is in maintenance mode", _id); } } } @@ -201,7 +201,7 @@ protected void checkAvailability(final Command[] cmds) throws AgentUnavailableEx if (_status == Status.Connecting) { for (final Command cmd : cmds) { if (Arrays.binarySearch(s_commandsNotAllowedInConnectingMode, cmd.getClass().toString()) >= 0) { - throw new AgentUnavailableException("Unable to send " + cmd.getClass().toString() + " because agent " + _name + " is in connecting mode", _id); + throw new AgentUnavailableException("Unable to send " + cmd.getClass() + " because agent " + _name + " is in connecting mode", _id); } } } @@ -396,13 +396,13 @@ public void send(final Request req, final Listener listener) throws AgentUnavail logger.trace(LOG_SEQ_FORMATTED_STRING, seq, " is current sequence"); } } catch (AgentUnavailableException e) { - logger.info(LOG_SEQ_FORMATTED_STRING, seq, "Unable to send due to " + e.getMessage()); + logger.info(LOG_SEQ_FORMATTED_STRING, seq, "Unable to send due to " + e.getMessage(), e); cancel(seq); throw e; } catch (Exception e) { logger.warn(LOG_SEQ_FORMATTED_STRING, seq, "Unable to send due to " + e.getMessage(), e); cancel(seq); - throw new AgentUnavailableException("Problem due to other exception " + e.getMessage(), _id); + throw new AgentUnavailableException("Problem due to other exception " + e.getMessage(), _id, e); } } } diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java index 527c5259376a..65059b61bbd9 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java @@ -19,15 +19,19 @@ import java.io.IOException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; -import java.net.SocketAddress; +import java.net.InetSocketAddress; import java.nio.channels.ClosedChannelException; +import java.security.SecureRandom; +import java.time.Duration; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Objects; +import java.util.Optional; +import java.util.Random; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; @@ -42,8 +46,17 @@ import javax.inject.Inject; import javax.naming.ConfigurationException; +import com.cloud.agent.api.AgentConnectStatusAnswer; +import com.cloud.agent.api.AgentConnectStatusCommand; +import com.cloud.event.EventTypes; +import com.cloud.utils.DateUtil; +import com.cloud.utils.Profiler; import com.cloud.utils.StringUtils; +import com.cloud.utils.backoff.impl.ExponentialWithJitterBackoff; +import com.github.benmanes.caffeine.cache.Cache; +import com.github.benmanes.caffeine.cache.Caffeine; import org.apache.cloudstack.agent.lb.IndirectAgentLB; +import org.apache.cloudstack.annotation.dao.AnnotationDao; import org.apache.cloudstack.ca.CAManager; import org.apache.cloudstack.command.ReconcileCommandService; import org.apache.cloudstack.command.ReconcileCommandUtils; @@ -51,6 +64,7 @@ import org.apache.cloudstack.command.dao.ReconcileCommandDao; import org.apache.cloudstack.engine.orchestration.service.NetworkOrchestrationService; import org.apache.cloudstack.framework.config.ConfigKey; +import org.apache.cloudstack.framework.config.ConfigKeyUtil; import org.apache.cloudstack.framework.config.Configurable; import org.apache.cloudstack.framework.config.dao.ConfigurationDao; import org.apache.cloudstack.framework.jobs.AsyncJob; @@ -61,7 +75,6 @@ import org.apache.cloudstack.management.ManagementServerHost; import org.apache.cloudstack.outofbandmanagement.dao.OutOfBandManagementDao; import org.apache.cloudstack.utils.identity.ManagementServerNode; -import org.apache.cloudstack.utils.reflectiontostringbuilderutils.ReflectionToStringBuilderUtils; import org.apache.commons.collections.MapUtils; import org.apache.commons.lang3.BooleanUtils; import org.apache.commons.lang3.ObjectUtils; @@ -141,11 +154,33 @@ import com.cloud.utils.nio.Task; import com.cloud.utils.time.InaccurateClock; +import static com.cloud.utils.backoff.BackoffFactory.BACKOFF_IMPLEMENTATION_KEY; +import static com.cloud.utils.backoff.impl.ExponentialWithJitterBackoff.MAX_DELAY_MS_CONFIG_KEY; +import static com.cloud.utils.backoff.impl.ExponentialWithJitterBackoff.MAX_DELAY_MS_DEFAULT; +import static com.cloud.utils.backoff.impl.ExponentialWithJitterBackoff.MIN_DELAY_MS_CONFIG_KEY; +import static com.cloud.utils.backoff.impl.ExponentialWithJitterBackoff.MIN_DELAY_MS_DEFAULT; + /** * Implementation of the Agent Manager. This class controls the connection to the agents. **/ public class AgentManagerImpl extends ManagerBase implements AgentManager, HandlerFactory, ManagementServerMaintenanceListener, Configurable { + /** + * Base timeout in seconds to obtain {@link GlobalLock#lock(int)}. + */ + private static final int BASE_TIMEOUT_SEC = 60; + /** + * Jitter in seconds to obtain {@link GlobalLock#lock(int)} + */ + private static final int JITTER_TIMEOUT_SEC = 15; + + /** + * Those statuses used in + * {@link AgentManagerImpl#handleDisconnectWithoutInvestigation(AgentAttache, Event, boolean, boolean)} + * to identify Host status as connected + */ + private static final Set HOST_STATUSES_CONNECTED = Set.of(Status.Up, Status.Alert, Status.Connecting, Status.Creating, Status.Rebalancing); + /** * _agents is a ConcurrentHashMap, but it is used from within a synchronized block. This will be reported by findbugs as JLM_JSR166_UTILCONCURRENT_MONITORENTER. Maybe a * ConcurrentHashMap is not the right thing to use here, but i'm not sure so i leave it alone. @@ -219,11 +254,14 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl private boolean _reconcileCommandsEnabled = false; private Integer _reconcileCommandInterval; + private Random _random = new SecureRandom(); @Inject ResourceManager _resourceMgr; @Inject ManagementServiceConfiguration mgmtServiceConf; + @Inject + private AnnotationDao annotationDao; protected final ConfigKey Workers = new ConfigKey<>("Advanced", Integer.class, "workers", "5", "Number of worker threads handling remote agent connections.", false); @@ -249,6 +287,78 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl protected final ConfigKey CheckTxnBeforeSending = new ConfigKey<>("Developer", Boolean.class, "check.txn.before.sending.agent.commands", "false", "This parameter allows developers to enable a check to see if a transaction wraps commands that are sent to the resource. This is not to be enabled on production systems.", true); + protected final ConfigKey BackoffConfiguration = new ConfigKey<>("Advanced", String.class, "backoff.configuration", String.format("%s=%s;%s=%s;%s=%s", BACKOFF_IMPLEMENTATION_KEY, + ExponentialWithJitterBackoff.class.getName(), MIN_DELAY_MS_CONFIG_KEY, MIN_DELAY_MS_DEFAULT, + MAX_DELAY_MS_CONFIG_KEY, MAX_DELAY_MS_DEFAULT), + "Backoff algorithm configuration in format key1=value1;key2=value2;...", true); + protected final ConfigKey AgentHostStatusCheckDelay = new ConfigKey("Advanced", Integer.class, "agent.host.status.check.delay.sec", "15", + "Delay in seconds between host status checks on the agent side", true); + protected final ConfigKey AliveHostStatuses = new ConfigKey("Advanced", String.class, "alive.host.statuses", "Up,Creating,Connecting,Rebalancing", + "Comma-separated list of host statuses that are considered alive and do not require agent reconnect. Valid values: Up, Alert, Connecting, Creating, Rebalancing, Down, Disconnected, Removed.", + false); + protected ConfigKey PerformFullDisconnectOnAgentDisconnectEventBroadcast = new ConfigKey("Advanced", Boolean.class, "agent.disconnect.event.broadcast.perform.full.disconnect", "false", + "Performs a full host disconnect on AgentDisconnect event broadcast", true); + + /** + * Set of host statuses that are considered alive and do not require agent reconnect. + * Initialized once during startup from the configuration. + */ + private Set aliveHostStatuses; + + /** + * Initialize the set of host statuses that are considered alive based on the configuration + */ + private void initializeAliveHostStatuses() { + String statusesConfig = AliveHostStatuses.value(); + if (StringUtils.isEmpty(statusesConfig)) { + aliveHostStatuses = Set.of(Status.Up); + logger.debug(String.format("Using default alive host statuses: %s", aliveHostStatuses)); + return; + } + + Set configuredStatuses = parseConfiguredAliveHostStatuses(statusesConfig); + + aliveHostStatuses = configuredStatuses.isEmpty() ? Set.of(Status.Up) : configuredStatuses; + logger.info("Configured alive host statuses: {}", aliveHostStatuses); + } + + private Set parseConfiguredAliveHostStatuses(String statusesConfig) { + Set configuredStatuses = new HashSet<>(); + String[] statusNames = statusesConfig.split("\\s*,\\s*"); + logger.debug("Processing alive host statuses configuration: '{}'", statusesConfig); + for (String statusName : statusNames) { + if (StringUtils.isNotEmpty(statusName)) { + try { + Status status = null; + for (Status item : Status.values()) { + if (item.name().equalsIgnoreCase(statusName)) { + status = item; + break; + } + } + if (status != null) { + configuredStatuses.add(status); + logger.debug("Added status '{}' to alive host statuses", status); + } else { + logger.warn("Invalid status name '{}' in alive.host.statuses configuration, ignoring", statusName); + } + } catch (Exception e) { + logger.warn("Error processing status name '{}' in alive.host.statuses configuration: {}", statusName, e.getMessage()); + } + } + } + return configuredStatuses; + } + + /** + * Get the set of host statuses that are considered alive based on the configuration + */ + private Set getAliveHostStatuses() { + return aliveHostStatuses; + } + + protected MsCache msCache; + public static final List HOST_DOWN_ALERT_UNSUPPORTED_HOST_TYPES = Arrays.asList( Host.Type.SecondaryStorage, Host.Type.ConsoleProxy @@ -257,6 +367,9 @@ public class AgentManagerImpl extends ManagerBase implements AgentManager, Handl @Override public boolean configure(final String name, final Map params) throws ConfigurationException { + //TODO: externalize cache configuration + msCache = new MsCache(_mshostDao, 10, 10); + logger.info("Ping Timeout is {}.", mgmtServiceConf.getPingTimeout()); _nodeId = ManagementServerNode.getManagementServerId(); @@ -277,11 +390,11 @@ public boolean configure(final String name, final Map params) th maxConcurrentNewAgentConnections = RemoteAgentMaxConcurrentNewConnections.value(); - _connection = new NioServer("AgentManager", Port.value(), Workers.value() + 10, + _connection = new NioServer("AgentManager", Port.value(), Workers.value(), this, caService, RemoteAgentSslHandshakeTimeout.value()); - logger.info("Listening on {} with {} workers.", Port.value(), Workers.value()); + logger.info("Listening on {} with {} workers", Port.value(), Workers.value()); - final int directAgentPoolSize = DirectAgentPoolSize.value(); + int directAgentPoolSize = DirectAgentPoolSize.value(); // executes all agent commands other than cron and ping _directAgentExecutor = new ScheduledThreadPoolExecutor(directAgentPoolSize, new NamedThreadFactory("DirectAgent")); // executes cron and ping agent commands @@ -313,14 +426,27 @@ public int getNewConnectionsCount() { } @Override - public void registerNewConnection(SocketAddress address) { + public void registerNewConnection(InetSocketAddress address) { logger.trace("Adding new agent connection from {}", address.toString()); - newAgentConnections.putIfAbsent(address.toString(), System.currentTimeMillis()); + try { + String ipAddress = address.getAddress().getHostAddress(); + String msName = msCache.get(ipAddress); + if (msName != null) { + logger.info("New connection {} is from Management Server {} ({}), skipping registration", + address, ipAddress, msName); + } else { + logger.debug("Adding new agent connection ({} of {}) from {}", + newAgentConnections.size(), maxConcurrentNewAgentConnections, address); + newAgentConnections.putIfAbsent(address.toString(), System.currentTimeMillis()); + } + } catch (Exception e) { + logger.error("Failed to register new connection {}", address, e); + } } @Override - public void unregisterNewConnection(SocketAddress address) { - logger.trace("Removing new agent connection for {}", address.toString()); + public void unregisterNewConnection(InetSocketAddress address) { + logger.trace("Removing new agent connection ({} of {}) for {}", newAgentConnections.size(), maxConcurrentNewAgentConnections, address); newAgentConnections.remove(address.toString()); } @@ -454,6 +580,45 @@ private AgentControlAnswer handleControlCommand(final AgentAttache attache, fina return new AgentControlAnswer(cmd); } + private AgentConnectStatusAnswer handleAgentConnectStatusCommand(AgentAttache attache, AgentConnectStatusCommand cmd) { + HostVO hostVo = _hostDao.findById(attache.getId()); + return getConnectStatusAnswer(hostVo, cmd); + } + + private AgentConnectStatusAnswer getConnectStatusAnswer(HostVO hostVo, AgentConnectStatusCommand cmd) { + long hostId = hostVo.getId(); + String hostName = hostVo.getName(); + String lockName = getHostJoinLockName(hostId); + Status status = hostVo.getStatus(); + try { + boolean lockAvailable = GlobalLock.isLockAvailable(lockName); + String details = String.format("Global lock %s is%s present for %s", lockName, lockAvailable ? "" : " not", + hostName); + logger.debug(details); + return getAgentConnectStatusAnswer(cmd, lockName, hostName, lockAvailable, status, details); + } catch (RuntimeException e) { + String msg = String.format("Failed to check global lock %s presence for %s: %s", lockName, hostName, + e.getMessage()); + logger.warn(msg, e); + return new AgentConnectStatusAnswer(cmd, false, msg); + } + } + + /** + * Returns {@link AgentConnectStatusAnswer}. + */ + private AgentConnectStatusAnswer getAgentConnectStatusAnswer(AgentConnectStatusCommand cmd, + String lockName, String hostName, + boolean lockAvailable, + Status status, + String details) { + logger.debug(String.format("Checking whether global lock %s is present for %s", lockName, hostName)); + AgentConnectStatusAnswer answer = new AgentConnectStatusAnswer(cmd, true, details); + answer.setLockAvailable(lockAvailable); + answer.setHostStatus(status); + return answer; + } + public void handleCommands(final AgentAttache attache, final long sequence, final Command[] cmds) { for (final Pair listener : _cmdMonitors) { final boolean processed = listener.second().processCommands(attache.getId(), sequence, cmds); @@ -707,9 +872,7 @@ protected AgentAttache getAttache(final Long hostId) throws AgentUnavailableExce public long send(final Long hostId, final Commands commands, final Listener listener) throws AgentUnavailableException { final AgentAttache agent = getAttache(hostId); if (agent.isClosed()) { - throw new AgentUnavailableException(String.format( - "Agent [id: %d, name: %s] is closed", - agent.getId(), agent.getName()), agent.getId()); + throw new AgentUnavailableException("Agent is closed", agent.getId()); } final Command[] cmds = checkForCommandsAndTag(commands); @@ -721,33 +884,88 @@ public long send(final Long hostId, final Commands commands, final Listener list return req.getSequence(); } + /** + * Remove and disconnect agent. + * + * @param attache agent connection attache + * @param nextState agent state + */ public void removeAgent(final AgentAttache attache, final Status nextState) { if (attache == null) { return; } final long hostId = attache.getId(); - logger.debug("Remove Agent : {}", attache); + String hostName = attache.getName(); + String hostUuid = attache.getUuid(); + Profiler processProfiler = new Profiler(); + processProfiler.start(); + logger.debug("Process disconnect for host: {} ({})", hostName, hostUuid); + + /* Remove from ping map first before processing other monitors. This is to ensure we do not submit + same disconnects multiple times in scenarios when system is overloaded and when a particular + monitor's process disconnect takes too much time. + */ + _pingMap.remove(hostId); + logger.debug("Removed agent attache from ping map to prevent multiple disconnects: {}", attache); + + try { + processAgentDisconnect(attache, nextState, hostName, hostUuid, hostId); + // then remove agent + } finally { + if (logger.isDebugEnabled()) { + logger.debug("Remove Agent : {}", attache); + } + processProfiler.stop(); + long processDurationSec = processProfiler.getDurationInMillis() / 1000; + logger.debug("Finished process disconnect for host: {} ({}) duration, s: {}", hostName, hostUuid, processDurationSec); + + AgentAttache removed = removeAgent(hostId, attache); + if (removed != null) { + removed.disconnect(nextState); + } + } + } + + private void processAgentDisconnect(AgentAttache attache, Status nextState, String hostName, String hostUuid, long hostId) { + // first try to disconnect everything + int monitorsSize = _hostMonitors.size(); + for (int monitorIndex = 0; monitorIndex < monitorsSize; monitorIndex++) { + Profiler monitorProfiler = new Profiler(); + Pair monitor = _hostMonitors.get(monitorIndex); + Listener listener = monitor.second(); + String monitorClassName = listener.getClass().getSimpleName(); + try { + monitorProfiler.start(); + logger.debug("Process disconnect for host: {} ({}) listener: {} ({} of {})", + hostName, hostUuid, monitorClassName, monitorIndex, monitorsSize); + monitor.second().processDisconnect(hostId, attache.getUuid(), attache.getName(), nextState); + } finally { + monitorProfiler.stop(); + long monitorDurationSec = monitorProfiler.getDurationInMillis() / 1000; + logger.debug("Finished process disconnect for host: {} ({}) listener: {} ({} of {}) duration, s: {}", + hostName, hostUuid, monitorClassName, monitorIndex, monitorsSize, monitorDurationSec); + } + } + } + + /** + * Remove agent from the agents cache but does not perform disconnect. + * + * @param hostId agent host id + * @param attache agent connection attache + * @return removed agent connection attache + */ + protected AgentAttache removeAgent(long hostId, AgentAttache attache) { AgentAttache removed; - boolean conflict = false; synchronized (_agents) { removed = _agents.remove(hostId); if (removed != null && removed != attache) { - conflict = true; + logger.debug("Agent for host {} is created when it is being disconnected", attache); _agents.put(hostId, removed); removed = attache; } } - if (conflict) { - logger.debug("Agent for host {} is created when it is being disconnected", attache); - } - if (removed != null) { - removed.disconnect(nextState); - } - - for (final Pair monitor : _hostMonitors) { - logger.debug("Sending Disconnect to listener: {}", monitor.second().getClass().getName()); - monitor.second().processDisconnect(hostId, attache.getUuid(), attache.getName(), nextState); - } + return removed; } @Override @@ -762,40 +980,124 @@ public void notifyMonitorsOfNewlyAddedHost(long hostId) { protected AgentAttache notifyMonitorsOfConnection(final AgentAttache attache, final StartupCommand[] cmds, final boolean forRebalance) throws ConnectionException { final long hostId = attache.getId(); final HostVO host = _hostDao.findById(hostId); - for (final Pair monitor : _hostMonitors) { - logger.debug("Sending Connect to listener: {}, for rebalance: {}", monitor.second().getClass().getSimpleName(), forRebalance); - for (StartupCommand cmd : cmds) { - try { - logger.debug("process connection to issue: {} for host: {}, forRebalance: {}", ReflectionToStringBuilderUtils.reflectOnlySelectedFields(cmd, "id", "type", "msHostList", "connectionTransferred"), hostId, forRebalance); - monitor.second().processConnect(host, cmd, forRebalance); - } catch (final ConnectionException ce) { - if (ce.isSetupError()) { - logger.warn("Monitor {} says there is an error in the connect process for {} due to {}", monitor.second().getClass().getSimpleName(), hostId, ce.getMessage()); - handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, true, true); - throw ce; - } else { - logger.info("Monitor {} says not to continue the connect process for {} due to {}", monitor.second().getClass().getSimpleName(), hostId, ce.getMessage()); - handleDisconnectWithoutInvestigation(attache, Event.ShutdownRequested, true, true); - return attache; - } - } catch (final HypervisorVersionChangedException hvce) { - handleDisconnectWithoutInvestigation(attache, Event.ShutdownRequested, true, true); - throw new CloudRuntimeException("Unable to connect " + attache.getId(), hvce); - } catch (final Exception e) { - logger.error("Monitor {} says there is an error in the connect process for {} due to {}", monitor.second().getClass().getSimpleName(), hostId, e.getMessage(), e); - handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, true, true); - throw new CloudRuntimeException("Unable to connect " + attache.getId(), e); - } + Optional hostOpt = Optional.ofNullable(host); + String hostName = hostOpt.map(HostVO::getName).orElse(null); + String hostUuid = hostOpt.map(HostVO::getUuid).orElse(null); + boolean processSuccessful; + Profiler processProfiler = new Profiler(); + processProfiler.start(); + logger.debug("Process connect for host: {} ({}) for rebalance: {}", hostName, hostUuid, forRebalance); + try { + processSuccessful = performProcessConnect(attache, cmds, forRebalance, hostName, hostUuid, host); + } finally { + processProfiler.stop(); + long processDurationSec = processProfiler.getDurationInMillis() / 1000; + logger.debug("Finished process connect for host: {} ({}) for rebalance: {} duration, s: {}", + hostName, hostUuid, forRebalance, processDurationSec); + } + if (processSuccessful) { + sendReadyCommand(attache, host, hostId); + + agentStatusTransitTo(host, Event.Ready, _nodeId); + attache.ready(); + } + return attache; + } + + /** + * @return true if commands successfully completed + */ + private boolean performProcessConnect(AgentAttache attache, StartupCommand[] cmds, boolean forRebalance, String hostName, String hostUuid, HostVO host) throws ConnectionException { + int monitorsSize = _hostMonitors.size(); + boolean processSuccessful = true; + for (int monitorIndex = 0; monitorIndex < monitorsSize && processSuccessful; monitorIndex++) { + Pair monitor = _hostMonitors.get(monitorIndex); + Listener listener = monitor.second(); + String monitorClassName = listener.getClass().getSimpleName(); + Profiler monitorProfiler = new Profiler(); + logger.debug("Process connect for host: {} ({}) listener {} ({} of {}) for rebalance: {}", + hostName, hostUuid, monitorClassName, monitorIndex, monitorsSize, forRebalance); + try { + monitorProfiler.start(); + processSuccessful = performProcessConnect(attache, cmds, forRebalance, hostName, hostUuid, monitorClassName, listener, host); + } finally { + monitorProfiler.stop(); + long monitorDurationSec = monitorProfiler.getDurationInMillis() / 1000; + logger.debug("Finished process connect for host: {} ({}) listener: {} ({} of {}) for rebalance: {} duration, s: {}", + hostName, hostUuid, monitorClassName, monitorIndex, monitorsSize, forRebalance, monitorDurationSec); + } + } + return processSuccessful; + } + + /** + * @return true if commands successfully completed + */ + private boolean performProcessConnect(AgentAttache attache, StartupCommand[] cmds, boolean forRebalance, String hostName, String hostUuid, String monitorClassName, Listener listener, HostVO host) throws ConnectionException { + boolean processSuccessful = true; + int commandsSize = cmds.length; + for (int commandIndex = 0; commandIndex < commandsSize && processSuccessful; commandIndex++) { + StartupCommand command = cmds[commandIndex]; + String commandClassName = command.getClass().getSimpleName(); + boolean connTransferred = command.isConnectionTransferred(); + long processStart = System.currentTimeMillis(); + try { + logger.debug("Process command: {} ({} of {}) in connect for host: {} ({}) listener: {} for rebalance: {} connection transferred: {}", + commandClassName, commandIndex, commandsSize, hostName, hostUuid, monitorClassName, forRebalance, connTransferred); + listener.processConnect(host, command, forRebalance); + } catch (Exception e) { + handleConnectionException(attache, e, processStart, host, monitorClassName); + processSuccessful = false; + } finally { + long commandDurationSec = (System.currentTimeMillis() - processStart) / 1000; + logger.debug("Finished processing command: {} ({} of {}) in connect for host: {} ({}) listener: {} for rebalance: {} connection transferred: {} duration, s: {}", + commandClassName, commandIndex, commandsSize, hostName, hostUuid, monitorClassName, forRebalance, connTransferred, commandDurationSec); + } + } + return processSuccessful; + } + + private void handleConnectionException(AgentAttache attache, Exception e, long processStart, HostVO host, String monitorClassName) throws ConnectionException { + long processDurationMs = System.currentTimeMillis() - processStart; + StringBuilder summaryBuilder = getSummaryMsgBuilder("Failed to connect Host", + EventTypes.EVENT_HOST_RECONNECT, host.getUuid(), host.getName(), null, + e.getMessage(), monitorClassName, processDurationMs); + logger.fatal(summaryBuilder.toString(), e); + + // add alert to the Host here + if (e instanceof ConnectionException) { + ConnectionException ce = (ConnectionException) e; + // XXX: in case of Storage Pool issue we are ending up here + // Failed to establish connection with PowerFlex Gateway to check host SDC connection + if (ce.isSetupError()) { + logger.warn("Monitor {} says there is an error in the connect process for {} due to {}", + monitorClassName, host, e.getMessage()); + handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, true, true); + throw ce; + } else { + logger.info("Monitor {} says not to continue the connect process for {} due to {}", + monitorClassName, host, e.getMessage()); + handleDisconnectWithoutInvestigation(attache, Event.ShutdownRequested, true, true); } + } else if (e instanceof HypervisorVersionChangedException) { + handleDisconnectWithoutInvestigation(attache, Event.ShutdownRequested, true, true); + throw new CloudRuntimeException(String.format("Unable to connect %s", attache), e); + } else { + logger.error("Monitor {} says there is an error in the connect process for {} due to {}", + monitorClassName, host, e.getMessage(), e); + handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, true, true); + throw new CloudRuntimeException(String.format("Unable to connect %s", attache), e); } + } - final ReadyCommand ready = new ReadyCommand(host, NumbersUtil.enableHumanReadableSizes); + private void sendReadyCommand(AgentAttache attache, HostVO host, long hostId) { + ReadyCommand ready = new ReadyCommand(host, NumbersUtil.enableHumanReadableSizes); ready.setWait(ReadyCommandWait.value()); final Answer answer = easySend(hostId, ready); if (answer == null || !answer.getResult()) { // this is tricky part for secondary storage // make it as disconnected, wait for secondary storage VM to be up - // return the attache instead of null, even it is disconnectede + // return the attache instead of null, even it is disconnected handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, true, true); } if (answer instanceof ReadyAnswer) { @@ -827,10 +1129,6 @@ protected AgentAttache notifyMonitorsOfConnection(final AgentAttache attache, fi } } } - - agentStatusTransitTo(host, Event.Ready, _nodeId); - attache.ready(); - return attache; } @Override @@ -843,6 +1141,7 @@ public boolean start() { } initConnectExecutor(); + initializeAliveHostStatuses(); startDirectlyConnectedHosts(false); if (_connection != null) { @@ -1039,27 +1338,102 @@ protected Status getNextStatusOnDisconnection(Host host, final Status.Event even return nextStatus; } + /** + * Deregister {@link AgentAttache} from the {@link AgentManagerImpl}.
+ * Unlink {@link AgentManagerImpl#handleDisconnectWithoutInvestigation(AgentAttache, Event, boolean, boolean)}, + * this method does not perform full Host disconnect process with calling + * + * @param attache {@link AgentAttache} + * @param event {@link Event} + * @return true if {@link AgentAttache} was deregistered. + */ + protected boolean handleDeregisterAttache(AgentAttache attache, Event event) { + logger.info("Attache {} is deregistering with event {}", attache, event); + boolean isForForward = Optional.ofNullable(attache).filter(AgentAttache::forForward).isPresent(); + if (isForForward) { + long hostId = attache.getId(); + Status nextStatus = Optional.ofNullable(_hostDao.findById(hostId)) + .map(host -> getNextStatusOnDisconnection(host, event)) + .orElseGet(() -> { + Status status = Status.Removed; + logger.warn(String.format("Can't find host with id %d (%s), use status %s", hostId, attache, + status)); + return status; + }); + + /* Remove from ping map first before processing other monitors. This is to ensure we do not submit + same disconnects multiple times in scenarios when system is overloaded and when a particular + monitor's process disconnect takes too much time. + */ + _pingMap.remove(hostId); + logger.debug("Removed agent attache from ping map to prevent multiple disconnects: {}", attache); + + logger.debug("Deregistering attache for {} with state {}", attache, nextStatus); + Optional.ofNullable(removeAgent(hostId, attache)) + .ifPresent(removedAttache -> removedAttache.disconnect(nextStatus)); + return true; + } else { + logger.warn("Attache {} is not for forward, skip", attache); + } + return false; + } + protected boolean handleDisconnectWithoutInvestigation(final AgentAttache attache, final Status.Event event, final boolean transitState, final boolean removeAgent) { final long hostId = attache.getId(); - final HostVO host = _hostDao.findById(hostId); boolean result = false; GlobalLock joinLock = getHostJoinLock(hostId); try { - if (!joinLock.lock(60)) { - logger.debug("Unable to acquire lock on host {} to process agent disconnection", Objects.toString(host, String.valueOf(hostId))); - return result; + logger.debug("Handling disconnect without investigation for {}", attache); + if (joinLock.lock(getTimeoutSec())) { + result = disconnectAgent(attache, event, transitState, hostId, joinLock); } - - logger.debug("Acquired lock on host {}, to process agent disconnection", Objects.toString(host, String.valueOf(hostId))); - disconnectHostAgent(attache, event, host, transitState, joinLock); - result = true; } finally { joinLock.releaseRef(); + logger.debug("Finished handling disconnect without investigation for {}", attache); } return result; } + private boolean disconnectAgent(AgentAttache attache, Event event, boolean transitState, long hostId, GlobalLock joinLock) { + try { + logger.info("Host {} is disconnecting with event {}", attache, event); + Status nextStatus; + Status currentStatus; + final HostVO host = _hostDao.findById(hostId); + if (host == null) { + logger.warn("Can't find host with id: {} ({})", hostId, attache); + currentStatus = null; + nextStatus = Status.Removed; + } else { + currentStatus = host.getStatus(); + nextStatus = getNextStatusOnDisconnection(host, event); + caService.purgeHostCertificate(host); + } + + // If Host has "alive" status, let's change it to Alert to let user know host is not Up + // but there is a change in the state, because in case of slow disconnect processing it may take + // a time to reflect the status. + // Ideally would like to see "Disconnecting" state of the Host for such case. + if (host != null && transitState && currentStatus != null && HOST_STATUSES_CONNECTED.contains(currentStatus)) { + logger.debug("Changing status of the host {} from {} to {} on event {}", host, currentStatus, nextStatus, event); + _hostDao.updateState(host.getState(), event, Status.Alert, host, host.getId()); + } + if (logger.isDebugEnabled()) { + logger.debug("Deregistering link for {} with state {}", attache, nextStatus); + } + removeAgent(attache, nextStatus); + + if (host != null && transitState) { + // update the state for host in DB as per the event + disconnectAgent(host, event, _nodeId); + } + } finally { + joinLock.unlock(); + } + return true; + } + private void disconnectHostAgent(final AgentAttache attache, final Status.Event event, final HostVO host, final boolean transitState, final GlobalLock joinLock) { try { logger.info("Host {} is disconnecting with event {}", attache, event); @@ -1230,15 +1604,16 @@ public Answer easySend(final Long hostId, final Command cmd) { } return answer; - + // FIXME: There are a lot of result != null checks in callers. Should this method trow exception? + // In case of null the Domain logic has no idea what is going on; otherwise it can do additional steps/validation. } catch (final AgentUnavailableException e) { - logger.warn(e.getMessage()); + logger.warn("Failed to send command {} to Host {}: {}", cmd.getClass().getSimpleName(), hostId, e.getLocalizedMessage(), e); return null; } catch (final OperationTimedoutException e) { - logger.warn("Operation timed out: {}", e.getMessage()); + logger.warn("Timed out to send command {} to Host {}: {}", cmd.getClass().getSimpleName(), hostId, e.getLocalizedMessage(), e); return null; } catch (final Exception e) { - logger.warn("Exception while sending", e); + logger.warn("Exception while sending command {} to Host {}: {}", cmd.getClass().getSimpleName(), hostId, e.getLocalizedMessage(), e); return null; } } @@ -1309,6 +1684,15 @@ public void notifyMonitorsOfRemovedHost(long hostId, long clusterId) { } } + /** + * Executes only {@link Event#AgentDisconnected} and {@link Event#ShutdownRequested} requests. + * + * @param hostId Host Id + * @param event {@link Event} + * @return {@link Boolean#TRUE} if request is successful or have been anything done (mostly useless as it does + * not reflect resource state change) + * @throws AgentUnavailableException + */ public boolean executeUserRequest(final long hostId, final Event event) throws AgentUnavailableException { if (event == Event.AgentDisconnected) { AgentAttache attache; @@ -1353,16 +1737,23 @@ protected AgentAttache createAttacheForConnect(final HostVO host, final Link lin return attache; } - private AgentAttache sendReadyAndGetAttache(HostVO host, ReadyCommand ready, Link link, StartupCommand[] startupCmds) throws ConnectionException { + private AgentAttache sendReadyAndGetAttache(HostVO host, ReadyCommand ready, Link link, StartupCommand[] startup) throws ConnectionException { AgentAttache attache; GlobalLock joinLock = getHostJoinLock(host.getId()); try { - if (!joinLock.lock(60)) { - throw new ConnectionException(true, String.format("Unable to acquire lock on host %s, to process agent connection", host)); + long processStart = System.currentTimeMillis(); + if (joinLock.lock(getTimeoutSec())) { + logProcessingStart(host, joinLock); + try { + updateReadyCommandWithMSList(host, ready, startup); + attache = createAndNotifyAttache(host, link, startup); + } finally { + logProcessingFinish(host, joinLock, processStart); + joinLock.unlock(); + } + } else { + throw createLockAcquisitionException(host, joinLock, processStart); } - - logger.debug("Acquired lock on host {}, to process agent connection", host); - attache = connectHostAgent(host, ready, link, startupCmds, joinLock); } finally { joinLock.releaseRef(); } @@ -1370,45 +1761,105 @@ private AgentAttache sendReadyAndGetAttache(HostVO host, ReadyCommand ready, Lin return attache; } - private AgentAttache connectHostAgent(HostVO host, ReadyCommand ready, Link link, StartupCommand[] startupCmds, GlobalLock joinLock) throws ConnectionException { - AgentAttache attache; - try { - final List agentMSHostList = new ArrayList<>(); - String lbAlgorithm = null; - if (startupCmds != null && startupCmds.length > 0) { - final String agentMSHosts = startupCmds[0].getMsHostList(); - if (StringUtils.isNotEmpty(agentMSHosts)) { - String[] msHosts = agentMSHosts.split("@"); - if (msHosts.length > 1) { - lbAlgorithm = msHosts[1]; - } - agentMSHostList.addAll(Arrays.asList(msHosts[0].split(","))); + private void logProcessingStart(HostVO host, GlobalLock joinLock) { + StringBuilder msgBuilder = getSummaryMsgBuilder("Processing Host connection started", + EventTypes.EVENT_HOST_RECONNECT, host.getUuid(), host.getName(), joinLock.getName(), null, null, null); + logger.info(msgBuilder.toString()); + } + + private void logProcessingFinish(HostVO host, GlobalLock joinLock, long processStart) { + long processFinish = System.currentTimeMillis() - processStart; + StringBuilder msgBuilder = getSummaryMsgBuilder("Processing Host connection finished", + EventTypes.EVENT_HOST_RECONNECT, host.getUuid(), host.getName(), joinLock.getName(), null, null, + processFinish); + logger.fatal(msgBuilder.toString()); + } + + private void updateReadyCommandWithMSList(HostVO host, ReadyCommand ready, StartupCommand[] startup) { + List agentMSHostList = new ArrayList<>(); + String lbAlgorithm = null; + + if (startup != null) { + String agentMSHosts = startup[0].getMsHostList(); + if (StringUtils.isNotEmpty(agentMSHosts)) { + String[] msHosts = agentMSHosts.split("@"); + if (msHosts.length > 1) { + lbAlgorithm = msHosts[1]; } + agentMSHostList.addAll(Arrays.asList(msHosts[0].split(","))); } + } - if (!indirectAgentLB.compareManagementServerListAndLBAlgorithm(host.getId(), host.getDataCenterId(), agentMSHostList, lbAlgorithm)) { - final List newMSList = indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(), null); - ready.setMsHostList(newMSList); - String newLBAlgorithm = indirectAgentLB.getLBAlgorithmName(); - ready.setLbAlgorithm(newLBAlgorithm); - logger.debug("Agent's management server host list or lb algorithm is not up to date, sending list and algorithm update: {}, {}", newMSList, newLBAlgorithm); - } + if (!indirectAgentLB.compareManagementServerListAndLBAlgorithm(host.getId(), host.getDataCenterId(), agentMSHostList, lbAlgorithm)) { + setReadyCommandMSList(host, ready); + } + } + + private void setReadyCommandMSList(HostVO host, ReadyCommand ready) { + List newMSList = indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(), null); + ready.setMsHostList(newMSList); + List avoidMsList = _mshostDao.listNonUpStateMsIPs(); + ready.setAvoidMsHostList(avoidMsList); + ready.setLbAlgorithm(indirectAgentLB.getLBAlgorithmName()); + ready.setLbCheckInterval(indirectAgentLB.getLBPreferredHostCheckInterval(host.getClusterId())); + logger.debug("Agent's management server host list is not up to date, sending list update:" + newMSList); + } - final List avoidMsList = _mshostDao.listNonUpStateMsIPs(); - ready.setAvoidMsHostList(avoidMsList); - ready.setLbCheckInterval(indirectAgentLB.getLBPreferredHostCheckInterval(host.getClusterId())); - ready.setArch(host.getArch().getType()); + private AgentAttache createAndNotifyAttache(HostVO host, Link link, StartupCommand[] startup) throws ConnectionException { + AgentAttache attache = createAttacheForConnect(host, link); + return notifyMonitorsOfConnection(attache, startup, false); + } - attache = createAttacheForConnect(host, link); - attache = notifyMonitorsOfConnection(attache, startupCmds, false); - } finally { - joinLock.unlock(); + private ConnectionException createLockAcquisitionException(HostVO host, GlobalLock joinLock, long processStart) { + long processFinish = System.currentTimeMillis() - processStart; + String title = "Failed to connect Host"; + String summary = String.format("Failure due to unable to acquire lock %s during %s seconds", + joinLock.getName(), 60); + + StringBuilder msgBuilder = getSummaryMsgBuilder(title, EventTypes.EVENT_HOST_RECONNECT, host.getUuid(), + host.getName(), joinLock.getName(), summary, null, processFinish); + + ConnectionException ex = new ConnectionException(true, String.format("Unable to acquire lock on host %s", + host)); + logger.fatal(msgBuilder.toString(), ex); + + return ex; + } + + private StringBuilder getSummaryMsgBuilder(String title, String eventType, String hostUuid, String hostName, String lockName, String details, String monitorClassName, Long durationMs) { + StringBuilder msgBuilder = new StringBuilder(); + msgBuilder.append("\n================================================================================\n\n"); + msgBuilder.append(title).append(":\n\n"); + msgBuilder.append(String.format("event_name=%s\n", eventType)); + msgBuilder.append(String.format("host_name=%s\n", hostName)); + msgBuilder.append(String.format("host_uuid=%s\n", hostUuid)); + if (org.apache.commons.lang3.StringUtils.isNotEmpty(lockName)) { + msgBuilder.append(String.format("lock_name=%s\n", lockName)); + } + msgBuilder.append(String.format("ms_id=%s\n", _nodeId)); + if (org.apache.commons.lang3.StringUtils.isNotEmpty(details)) { + msgBuilder.append(details).append("\n"); + } + if (org.apache.commons.lang3.StringUtils.isNotEmpty(monitorClassName)) { + msgBuilder.append(String.format("monitor_class=%s\n", monitorClassName)); } + if (durationMs != null) { + msgBuilder.append(String.format("process_duration=%s\n", DateUtil.formatMillis(durationMs))); + msgBuilder.append(String.format("process_duration_ms=%s\n", durationMs)); + } + msgBuilder.append("\n\n================================================================================\n"); + return msgBuilder; + } - return attache; + /** + * Generates timeout for {@link GlobalLock#lock(int)} in seconds + * using {@link AgentManagerImpl#BASE_TIMEOUT_SEC} and {@link AgentManagerImpl#JITTER_TIMEOUT_SEC}. + */ + private int getTimeoutSec() { + return BASE_TIMEOUT_SEC + _random.nextInt(2 * JITTER_TIMEOUT_SEC + 1) - JITTER_TIMEOUT_SEC; } - private AgentAttache handleConnectedAgent(final Link link, final StartupCommand[] startup) { + private AgentAttache handleConnectedAgent(Link link, StartupCommand[] startup, Request request) { AgentAttache attache = null; ReadyCommand ready = null; try { @@ -1419,6 +1870,7 @@ private AgentAttache handleConnectedAgent(final Link link, final StartupCommand[ attache = sendReadyAndGetAttache(host, ready, link, startup); } } catch (final Exception e) { + // FIXME: Agent is already disconnected, there is no place where to send the command logger.debug("Failed to handle host connection: ", e); ready = new ReadyCommand(null); ready.setDetails(e.toString()); @@ -1430,8 +1882,14 @@ private AgentAttache handleConnectedAgent(final Link link, final StartupCommand[ try { if (attache == null) { - final Request readyRequest = new Request(-1, -1, ready, false); - link.send(readyRequest.getBytes()); + if (!link.isTerminated()) { + // FIXME: it does not make any sense here as link already closed + final Request readyRequest = new Request(-1, _nodeId, ready, false); + readyRequest.logD("Sending "); + link.send(readyRequest.getBytes()); + } else { + logger.warn("Not sending any request to the agent as link is terminated"); + } } else { easySend(attache.getId(), ready); } @@ -1493,6 +1951,9 @@ protected void runInContext() { } } + /** + * Handles Agent {@link Status#Connecting} process after receiving {@link StartupRoutingCommand} from the Agent. + */ protected class HandleAgentConnectTask extends ManagedContextRunnable { Link _link; Command[] _cmds; @@ -1506,13 +1967,13 @@ protected class HandleAgentConnectTask extends ManagedContextRunnable { @Override protected void runInContext() { - _request.logD("Processing the first command "); + _request.logD("Processing the first command"); final StartupCommand[] startups = new StartupCommand[_cmds.length]; for (int i = 0; i < _cmds.length; i++) { startups[i] = (StartupCommand)_cmds[i]; } - final AgentAttache attache = handleConnectedAgent(_link, startups); + AgentAttache attache = handleConnectedAgent(_link, startups, _request); if (attache == null) { logger.warn("Unable to create attache for agent: {}", _request); } @@ -1521,24 +1982,26 @@ protected void runInContext() { } protected void connectAgent(final Link link, final Command[] cmds, final Request request) { - // send startupanswer to agent in the very beginning, so agent can move on without waiting for the answer for an undetermined time, if we put this logic into another - // thread pool. - final StartupAnswer[] answers = new StartupAnswer[cmds.length]; + // send startup answer to agent in the very beginning, so agent can move on without waiting for the answer for an undetermined time, + // if we put this logic into another thread pool. + Map backoffConfiguration = ConfigKeyUtil.toMap(BackoffConfiguration.value()); + StartupAnswer[] answers = new StartupAnswer[cmds.length]; Command cmd; for (int i = 0; i < cmds.length; i++) { cmd = cmds[i]; - if (cmd instanceof StartupRoutingCommand || cmd instanceof StartupProxyCommand || cmd instanceof StartupSecondaryStorageCommand || - cmd instanceof StartupStorageCommand) { - answers[i] = new StartupAnswer((StartupCommand) cmds[i], 0, "", "", mgmtServiceConf.getPingInterval()); - break; + if (cmd instanceof StartupRoutingCommand || cmd instanceof StartupProxyCommand || cmd instanceof StartupSecondaryStorageCommand + || cmd instanceof StartupStorageCommand) { + StartupAnswer answer = new StartupAnswer((StartupCommand) cmds[i], 0, "", "", mgmtServiceConf.getPingInterval()); + answer.setParams(backoffConfiguration); + answer.setAgentHostStatusCheckDelaySec(AgentHostStatusCheckDelay.value()); + answers[i] = answer; } } - Response response; - response = new Response(request, answers[0], _nodeId, -1); + Response response = new Response(request, answers[0], _nodeId, -1); try { link.send(response.toBytes()); - } catch (final ClosedChannelException e) { - logger.debug("Failed to send startupanswer: {}", e.toString()); + } catch (ClosedChannelException e) { + logger.debug("Failed to send startup answer: {}", e.getMessage(), e); } _connectExecutor.execute(new HandleAgentConnectTask(link, cmds, request)); } @@ -1585,6 +2048,25 @@ private void processStartupRoutingCommand(StartupRoutingCommand startup, long ho processHostHealthCheckResult(hostHealthCheckResult, hostId); } + /** + * Process out of band {@link PingCommand} and return back "avoid MS list" + * and request to reconnect (instead of throwing away). + */ + private void processPingCommand(Link link, PingCommand cmd, Request request) { + if (cmd.getContextParam("logid") != null) { + ThreadContext.put("logcontextid", cmd.getContextParam("logid")); + } + PingAnswer answer = new PingAnswer(cmd, getAvoidMsList(), true); + Response response = new Response(request, new Answer[]{answer}, _nodeId, cmd.getHostId()); + response.setSequence(request.getSequence()); + response.logD("Processing:", true); + try { + link.send(response.toBytes()); + } catch (final ClosedChannelException e) { + logger.warn("Unable to send response because connection is closed: " + response, e); + } + } + private void processPingRoutingCommand(PingRoutingCommand pingRoutingCommand, long hostId) { if (pingRoutingCommand == null) { logger.error("Empty PingRoutingCommand received"); @@ -1600,13 +2082,25 @@ protected void processRequest(final Link link, final Request request) { Command cmd = cmds[0]; boolean logD = true; + if (cmd !=null && cmd.getContextParam("logid") != null) { + ThreadContext.put("logcontextid", cmd.getContextParam("logid")); + } + if (attache == null) { - if (!(cmd instanceof StartupCommand)) { - logger.warn("Throwing away a request because it came through as the first command on a connect: {}", request); - } else { + // FIXME: if there are more than one command and first is not startup command, + // then startup command will never be processed as expected + if (cmd instanceof StartupCommand) { // submit the task for execution request.logD("Scheduling the first command "); connectAgent(link, cmds, request); + } else if (cmd instanceof AgentConnectStatusCommand) { + processAgentConnectStatusCommand(link, (AgentConnectStatusCommand) cmd, request); + } else if (cmd instanceof PingCommand) { + processPingCommand(link, (PingCommand) cmd, request); + // FIXME: ShutdownCommand observed as "throwing away..." more than 4000 times +// } else if (cmd instanceof ShutdownCommand) { + } else { + logger.warn("Throwing away a request because it came through as the first command on a connect: {}", request); } return; } else if (cmd instanceof StartupCommand) { @@ -1619,11 +2113,11 @@ protected void processRequest(final Link link, final Request request) { if (cmd instanceof PingRoutingCommand) { logD = false; logger.debug("Ping from Routing host {}", attache); - logger.trace("SeqA {}-{}: Processing {}", hostId, request.getSequence(), request); + logger.debug("SeqA {}-{}: Processing {}", hostId, request.getSequence(), request); } else if (cmd instanceof PingCommand) { logD = false; logger.debug("Ping from {}", attache); - logger.trace("SeqA {}-{}: Processing {}", hostId, request.getSequence(), request); + logger.debug("SeqA {}-{}: Processing {}", hostId, request.getSequence(), request); } else { logger.debug("SeqA {}-{}: {}", hostId, request.getSequence(), request); } @@ -1658,12 +2152,13 @@ protected void processRequest(final Link link, final Request request) { } return; } else if (cmd instanceof AgentControlCommand) { - answer = handleControlCommand(attache, (AgentControlCommand)cmd); + answer = handleControlCommand(attache, (AgentControlCommand) cmd); + } else if (cmd instanceof AgentConnectStatusCommand) { + answer = handleAgentConnectStatusCommand(attache, (AgentConnectStatusCommand) cmd); } else { handleCommands(attache, request.getSequence(), new Command[] {cmd}); if (cmd instanceof PingCommand) { final long cmdHostId = ((PingCommand)cmd).getHostId(); - boolean requestStartupCommand = false; final HostVO host = _hostDao.findById(cmdHostId); boolean gatewayAccessible = true; @@ -1690,10 +2185,12 @@ protected void processRequest(final Link link, final Request request) { logger.debug("Not processing {} for agent id={}; can't find the host in the DB", PingRoutingCommand.class.getSimpleName(), cmdHostId); } } - if (host != null && host.getStatus() != Status.Up && gatewayAccessible) { - requestStartupCommand = true; + + boolean requestStartupCommand = false; + if (host != null && gatewayAccessible) { + requestStartupCommand = sendRequestStartupCommand(hostId, host);; } - final List avoidMsList = _mshostDao.listNonUpStateMsIPs(); + final List avoidMsList = getAvoidMsList(); answer = new PingAnswer((PingCommand)cmd, avoidMsList, requestStartupCommand); // Add or update reconcile tasks @@ -1724,8 +2221,46 @@ protected void processRequest(final Link link, final Request request) { try { link.send(response.toBytes()); } catch (final ClosedChannelException e) { - logger.error("Unable to send response because connection is closed: {}", response); + logger.error("Unable to send response because connection is closed: {}", response, e); + } + } + + /** + * Determines whether a startup command should be requested from the agent based on host status. + *

+ * This method checks if the host's current status is considered "alive" according to the configured + * alive host statuses. If the host status is not in the alive list and the host join lock is available, + * it indicates that a startup command should be requested to re-establish proper host connectivity. + * The lock check prevents conflicts when the host is already in the process of connecting or reconnecting. + *

+ * + * @param hostId the unique identifier of the host + * @param host the host entity containing current status information + * @return true if a startup command should be requested from the agent, false otherwise + * + * @see #getAliveHostStatuses() + * @see GlobalLock#isLockAvailable(String) + */ + private boolean sendRequestStartupCommand(long hostId, HostVO host) { + boolean requestStartup = false; + Set aliveStatuses = getAliveHostStatuses(); + boolean lockAvailable = GlobalLock.isLockAvailable(getHostJoinLockName(hostId)); + if (host.getStatus() == null || !aliveStatuses.contains(host.getStatus())) { + if (!lockAvailable) { + logger.debug("Ping from host {}: requesting startup command " + + "due to host status ({}) is not considered as alive ({}), " + + "however the lock is not available, skipping", host, host.getStatus(), aliveStatuses); + } else { + requestStartup = true; + logger.debug("Ping from host {}: requesting startup command " + + "due to host status ({}) is not considered as alive ({})", host, host.getStatus(), aliveStatuses); + } } + return requestStartup; + } + + private List getAvoidMsList() { + return _mshostDao.listNonUpStateMsIPs(); } protected void processResponse(final Link link, final Response response) { @@ -1737,6 +2272,40 @@ protected void processResponse(final Link link, final Response response) { } } + private void processAgentConnectStatusCommand(Link link, AgentConnectStatusCommand cmd, Request request) { + request.logD("Processing:", true); + Long hostId = cmd.getHostId(); + // first try to find Host by id + HostVO hostVo = Optional.ofNullable(hostId) + .filter(id -> id > 0) + .map(_hostDao::findById) + // fallback to Host guid + .orElseGet(() -> Optional.ofNullable(cmd.getHostGuid()) + .map(_hostDao::findByGuid) + // then finally by name + .orElseGet(() -> Optional.ofNullable(cmd.getHostName()) + .map(_hostDao::findByName) + .orElse(null))); + + AgentConnectStatusAnswer answer; + if (hostVo != null) { + answer = getConnectStatusAnswer(hostVo, cmd); + } else { + String details = String.format("Unable to find Host in DB with id %s guid %s name %s", hostId, + cmd.getHostGuid(), cmd.getHostName()); + answer = getAgentConnectStatusAnswer(cmd, "N/A", "N/A", true, Status.Unknown, details); + } + + Response response = new Response(request, new Answer[]{answer}, _nodeId, Optional.ofNullable(hostId).orElse(-1L)); + response.setSequence(request.getSequence()); + response.logD("Processing:", true); + try { + link.send(response.toBytes()); + } catch (final ClosedChannelException e) { + logger.warn("Unable to send response because connection is closed: " + response); + } + } + @Override protected void doTask(final Task task) throws TaskExecutionException { try (TransactionLegacy txn = TransactionLegacy.open(TransactionLegacy.CLOUD_DB)) { @@ -1752,13 +2321,15 @@ protected void doTask(final Task task) throws TaskExecutionException { } } catch (final UnsupportedVersionException e) { logger.warn(e.getMessage()); - } catch (final ClassNotFoundException e) { + } catch (final Exception e) { final String message = String.format("Exception occurred when executing tasks! Error '%s'", e.getMessage()); logger.error(message); throw new TaskExecutionException(message, e); } } else if (type == Type.CONNECT) { + logger.debug("Received connect"); } else if (type == Type.DISCONNECT) { + logger.debug("Received disconnect"); final Link link = task.getLink(); final AgentAttache attache = (AgentAttache) link.attachment(); if (attache != null) { @@ -2108,10 +2679,12 @@ public String getConfigComponentName() { @Override public ConfigKey[] getConfigKeys() { - return new ConfigKey[] { CheckTxnBeforeSending, Workers, Port, Wait, AlertWait, DirectAgentLoadSize, + return new ConfigKey[]{CheckTxnBeforeSending, Workers, Port, Wait, AlertWait, DirectAgentLoadSize, DirectAgentPoolSize, DirectAgentThreadCap, EnableKVMAutoEnableDisable, ReadyCommandWait, GranularWaitTimeForCommands, RemoteAgentSslHandshakeTimeout, RemoteAgentMaxConcurrentNewConnections, - RemoteAgentNewConnectionsMonitorInterval, KVMHostDiscoverySshPort }; + RemoteAgentNewConnectionsMonitorInterval, KVMHostDiscoverySshPort, BackoffConfiguration, + AgentHostStatusCheckDelay, AliveHostStatuses, PerformFullDisconnectOnAgentDisconnectEventBroadcast + }; } protected class SetHostParamsListener implements Listener { @@ -2187,6 +2760,28 @@ public int getTimeout() { } + protected class MsCache { + + private final Cache> msCache; + private ManagementServerHostDao msHostDao; + + public MsCache(ManagementServerHostDao msHostDao, int cacheSize, int cacheDurationMin) { + this.msHostDao = msHostDao; + msCache = Caffeine.newBuilder() + .maximumSize(cacheSize) + .expireAfterWrite(Duration.ofMinutes(cacheDurationMin)) + .build(); + } + + public String get(String ipAddress) { + return msCache.get("msHosts", key -> msHostDao.findAllIncludingRemoved() + .stream() + .collect(Collectors.toMap(ManagementServerHostVO::getServiceIP, + ManagementServerHostVO::getName))) + .get(ipAddress); + } + } + protected Map> getHostsPerZone() { List allHosts = _resourceMgr.listAllHostsInAllZonesByType(Host.Type.Routing); if (allHosts == null) { @@ -2254,7 +2849,17 @@ public int getHostSshPort(HostVO host) { } private GlobalLock getHostJoinLock(Long hostId) { - return GlobalLock.getInternLock(String.format("%s-%s", "Host-Join", hostId)); + String lockName = getHostJoinLockName(hostId); + logger.debug("Obtaining internal lock for {}", lockName); + try { + return GlobalLock.getInternLock(lockName); + } finally { + logger.debug("Obtained internal lock for {}", lockName); + } + } + + private String getHostJoinLockName(Long hostId) { + return String.format("%s-%s", "Host-Join", hostId); } public boolean isReconcileCommandsEnabled(HypervisorType hypervisorType) { diff --git a/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java b/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java index cfa0949883fd..127cb340a1ca 100644 --- a/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java +++ b/engine/orchestration/src/main/java/com/cloud/agent/manager/ClusteredAgentManagerImpl.java @@ -121,6 +121,11 @@ public class ClusteredAgentManagerImpl extends AgentManagerImpl implements Clust protected HashMap _peers; protected HashMap _sslEngines; private final Timer _timer = new Timer("ClusteredAgentManager Timer"); + /** + * State flag to ensure Agent load-balancing performed only when Management Server started (once) + * or when Management Server goes back from {@link ManagementServerHost.State#Maintenance} + * (or {@link ManagementServerHost.State#PreparingForMaintenance}). + */ boolean _agentLbHappened = false; private int _mshostCounter = 0; @@ -285,8 +290,9 @@ protected AgentAttache createAttache(final HostVO host) { _agents.put(host.getId(), attache); } if (old != null) { - logger.debug("Remove stale agent attache from current management server"); - removeAgent(old, Status.Removed); + logger.debug("Remove stale agent attache from current management server {}", _nodeId); + // just remove agent but do not deinitialize + removeAgent(old.getId(), attache); } return attache; } @@ -350,6 +356,16 @@ protected boolean handleDisconnect(final AgentAttache agent, final Status.Event } } + /** + * Overrides {@link Event#AgentDisconnected} logic and falls back to + * {@link AgentManagerImpl#executeUserRequest(long, Event)} for other event types ({@link Event#ShutdownRequested}). + * + * @param hostId Host Id + * @param event {@link Event} + * @return {@link Boolean#TRUE} if request is successful or have been anything done (mostly useless as it does + * not reflect resource state change) + * @throws AgentUnavailableException + */ @Override public boolean executeUserRequest(final long hostId, final Event event) throws AgentUnavailableException { if (event == Event.AgentDisconnected) { @@ -369,6 +385,8 @@ public boolean executeUserRequest(final long hostId, final Event event) throws A } } + // preload value before validating attache for forward to avoid race condition in case of configuration retrieval slowness + boolean performFullDisconnectOnAgentDisconnectEventBroadcast = PerformFullDisconnectOnAgentDisconnectEventBroadcast.value(); // don't process disconnect if the disconnect came for the host via delayed cluster notification, // but the host has already reconnected to the current management server if (!attache.forForward()) { @@ -378,7 +396,19 @@ public boolean executeUserRequest(final long hostId, final Event event) throws A return true; } - return super.handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, false, true); + // If we received an AgentDisconnected event and we are here, + // it means the current server holds a forward attachment for an already disconnected host. + // We could either run full disconnect again (if the feature flag is "true"), + // or skip that and just deregister attache. + if (performFullDisconnectOnAgentDisconnectEventBroadcast) { + logger.debug("Processing {} event for the forward attache of host [id: {}, uuid: {}, name: {}]", + Event.AgentDisconnected, hostId, attache.getUuid(), attache.getName()); + return super.handleDisconnectWithoutInvestigation(attache, Event.AgentDisconnected, false, true); + } else { + logger.debug("Processing {} event (deregistering agent only) for the forward attache of host [id: {}, uuid: {}, name: {}]", + Event.AgentDisconnected, hostId, attache.getUuid(), attache.getName()); + return super.handleDeregisterAttache(attache, Event.AgentDisconnected); + } } return true; @@ -400,7 +430,8 @@ public void reconnect(final long hostId) throws CloudRuntimeException, AgentUnav } public void notifyNodesInCluster(final AgentAttache attache) { - logger.debug("Notifying other nodes of to disconnect"); + // this code will send ChangeAgentCommand to all instances in the cluster + logger.debug("Notifying other nodes of to disconnect agent {} ({})", attache.getId(), attache.getName()); final Command[] cmds = new Command[]{new ChangeAgentCommand(attache.getId(), Event.AgentDisconnected)}; _clusterMgr.broadcast(attache.getId(), _gson.toJson(cmds)); } @@ -581,7 +612,8 @@ protected AgentAttache getAttache(final Long hostId) throws AgentUnavailableExce AgentAttache agent = findAttache(hostId); if (agent == null || !agent.forForward()) { if (isHostOwnerSwitched(host)) { - logger.debug("{} has switched to another management server, need to update agent map with a forwarding agent attache", host); + logger.debug("Host {} has switched (from {}) to another management server ({}), " + + "need to update agent map with a forwarding agent attache", host, _nodeId, host.getManagementServerId()); agent = createAttache(host); } } @@ -754,15 +786,6 @@ public void onManagementNodeLeft(final List node public void onManagementNodeIsolated() { } - @Override - public void removeAgent(final AgentAttache attache, final Status nextState) { - if (attache == null) { - return; - } - - super.removeAgent(attache, nextState); - } - @Override public boolean executeRebalanceRequest(final long agentId, final long currentOwnerId, final long futureOwnerId, final Event event) throws AgentUnavailableException, OperationTimedoutException { return executeRebalanceRequest(agentId, currentOwnerId, futureOwnerId, event, false); @@ -873,11 +896,13 @@ public void startRebalanceAgents() { transfer = _hostTransferDao.startAgentTransfering(hostId, node.getMsid(), _nodeId); final Answer[] answer = sendRebalanceCommand(node.getMsid(), hostId, node.getMsid(), _nodeId); if (answer == null) { - logger.warn("Failed to get host {} from management server {}", host, node); + logger.warn("Failed to get host {} from management server {} to {}", host, node, _nodeId); result = false; + } else { + logger.debug("Succeeded to get host {} from management server {} to {}", host, node, _nodeId); } } catch (final Exception ex) { - logger.warn("Failed to get host {} from management server {}", host, node, ex); + logger.warn("Failed to get host {} from management server {} to {}", host, node, _nodeId, ex); result = false; } finally { if (transfer != null) { @@ -910,13 +935,15 @@ private Answer[] sendRebalanceCommand(final long peer, final long agentId, final final Command[] cmds = commands.toCommands(); try { - logger.debug("Forwarding {} to {}", cmds[0].toString(), peer); + logger.debug("Forwarding host {} from {} to {} as part of {} - cmd: {}, peer: {}", + agentId, currentOwnerId, futureOwnerId, event.name(), cmds[0].toString(), peer); final String peerName = Long.toString(peer); final String cmdStr = _gson.toJson(cmds); final String ansStr = _clusterMgr.execute(peerName, agentId, cmdStr, true); return _gson.fromJson(ansStr, Answer[].class); } catch (final Exception e) { - logger.warn("Caught exception while talking to {}", currentOwnerId, e); + logger.warn("Caught exception during forwarding host {} from {} to {} as part of {}", + agentId, currentOwnerId, futureOwnerId, event.name(), e); return null; } } @@ -1269,7 +1296,7 @@ public String dispatch(final ClusterServicePdu pdu) { logger.debug("Intercepting command for agent change: agent {} event: {}", cmd.getAgentId(), cmd.getEvent()); boolean result; try { - result = executeAgentUserRequest(cmd.getAgentId(), cmd.getEvent()); + result = executeUserRequest(cmd.getAgentId(), cmd.getEvent()); logger.debug("Result is {}", result); } catch (final AgentUnavailableException e) { @@ -1511,7 +1538,7 @@ public void onManagementServerPreparingForMaintenance() { @Override public void onManagementServerCancelPreparingForMaintenance() { logger.debug("Management server cancel preparing for maintenance"); - super.onManagementServerPreparingForMaintenance(); + super.onManagementServerCancelPreparingForMaintenance(); // needed for the case when Management Server in Preparing For Maintenance but didn't go to Maintenance state // (where this variable will be reset) @@ -1541,14 +1568,6 @@ public void onManagementServerCancelMaintenance() { } } - public boolean executeAgentUserRequest(final long agentId, final Event event) throws AgentUnavailableException { - return executeUserRequest(agentId, event); - } - - public boolean rebalanceAgent(final long agentId, final Event event, final long currentOwnerId, final long futureOwnerId) throws AgentUnavailableException, OperationTimedoutException { - return executeRebalanceRequest(agentId, currentOwnerId, futureOwnerId, event); - } - public boolean rebalanceAgent(final long agentId, final Event event, final long currentOwnerId, final long futureOwnerId, boolean isConnectionTransfer) throws AgentUnavailableException, OperationTimedoutException { return executeRebalanceRequest(agentId, currentOwnerId, futureOwnerId, event, isConnectionTransfer); } @@ -1557,6 +1576,12 @@ public boolean isAgentRebalanceEnabled() { return EnableLB.value(); } + /** + * Returns Agent Rebalancing Task. + * Runs on timer, but expects to be executed only when Management Server started (once) + * or when Management Server goes back from {@link ManagementServerHost.State#Maintenance} + * (or {@link ManagementServerHost.State#PreparingForMaintenance}). + */ private Runnable getAgentRebalanceScanTask() { return new ManagedContextRunnable() { @Override diff --git a/engine/orchestration/src/test/java/com/cloud/agent/manager/AgentManagerImplTest.java b/engine/orchestration/src/test/java/com/cloud/agent/manager/AgentManagerImplTest.java index 43d83a672c0f..0d7889d583b0 100644 --- a/engine/orchestration/src/test/java/com/cloud/agent/manager/AgentManagerImplTest.java +++ b/engine/orchestration/src/test/java/com/cloud/agent/manager/AgentManagerImplTest.java @@ -21,6 +21,7 @@ import com.cloud.agent.api.ReadyCommand; import com.cloud.agent.api.StartupCommand; import com.cloud.agent.api.StartupRoutingCommand; +import com.cloud.exception.AgentUnavailableException; import com.cloud.exception.ConnectionException; import com.cloud.host.Host; import com.cloud.host.HostVO; @@ -28,15 +29,30 @@ import com.cloud.host.dao.HostDao; import com.cloud.hypervisor.Hypervisor; import com.cloud.utils.Pair; +import com.cloud.utils.nio.Link; +import org.apache.cloudstack.framework.config.ConfigKey; +import org.apache.commons.lang3.reflect.FieldUtils; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.mockito.Mockito; +import java.lang.reflect.Method; import java.util.ArrayList; +import java.util.Set; +import java.util.UUID; + +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.when; public class AgentManagerImplTest { + private static final Long HOST_ID = 1L; + private static final String HOST_UUID = UUID.randomUUID().toString(); + private static final String HOST_NAME = "test-host-name"; + private static final Link LINK = new Link(null, null); + private static final Status.Event EVENT = Status.Event.AgentDisconnected; + private HostDao hostDao; private Listener storagePoolMonitor; private AgentAttache attache; @@ -47,9 +63,10 @@ public class AgentManagerImplTest { @Before public void setUp() throws Exception { host = new HostVO("some-Uuid"); + FieldUtils.writeField(host, "id", HOST_ID, true); host.setDataCenterId(1L); cmds = new StartupCommand[]{new StartupRoutingCommand()}; - attache = new ConnectedAgentAttache(null, 1L, "uuid", "kvm-attache", Hypervisor.HypervisorType.KVM, null, false); + attache = createClusterAttache(false); hostDao = Mockito.mock(HostDao.class); storagePoolMonitor = Mockito.mock(Listener.class); @@ -59,13 +76,18 @@ public void setUp() throws Exception { mgr._hostMonitors.add(new Pair<>(0, storagePoolMonitor)); } + private AgentAttache createClusterAttache(boolean forForward) { + Link link = forForward ? null : LINK; + return new ClusteredAgentAttache(mgr, HOST_ID, HOST_UUID, HOST_NAME, Hypervisor.HypervisorType.KVM, link, false); + } + @Test public void testNotifyMonitorsOfConnectionNormal() throws ConnectionException { - Mockito.when(hostDao.findById(Mockito.anyLong())).thenReturn(host); - Mockito.doNothing().when(storagePoolMonitor).processConnect(Mockito.eq(host), Mockito.eq(cmds[0]), Mockito.eq(false)); + when(hostDao.findById(Mockito.anyLong())).thenReturn(host); + Mockito.doNothing().when(storagePoolMonitor).processConnect(eq(host), eq(cmds[0]), eq(false)); Mockito.doReturn(true).when(mgr).handleDisconnectWithoutInvestigation(Mockito.any(attache.getClass()), Mockito.any(Status.Event.class), Mockito.anyBoolean(), Mockito.anyBoolean()); Mockito.doReturn(Mockito.mock(Answer.class)).when(mgr).easySend(Mockito.anyLong(), Mockito.any(ReadyCommand.class)); - Mockito.doReturn(true).when(mgr).agentStatusTransitTo(Mockito.eq(host), Mockito.eq(Status.Event.Ready), Mockito.anyLong()); + Mockito.doReturn(true).when(mgr).agentStatusTransitTo(eq(host), eq(Status.Event.Ready), Mockito.anyLong()); final AgentAttache agentAttache = mgr.notifyMonitorsOfConnection(attache, cmds, false); Assert.assertTrue(agentAttache.isReady()); // Agent is in UP state @@ -74,8 +96,8 @@ public void testNotifyMonitorsOfConnectionNormal() throws ConnectionException { @Test public void testNotifyMonitorsOfConnectionWhenStoragePoolConnectionHostFailure() throws ConnectionException { ConnectionException connectionException = new ConnectionException(true, "storage pool could not be connected on host"); - Mockito.when(hostDao.findById(Mockito.anyLong())).thenReturn(host); - Mockito.doThrow(connectionException).when(storagePoolMonitor).processConnect(Mockito.eq(host), Mockito.eq(cmds[0]), Mockito.eq(false)); + when(hostDao.findById(Mockito.anyLong())).thenReturn(host); + Mockito.doThrow(connectionException).when(storagePoolMonitor).processConnect(eq(host), eq(cmds[0]), eq(false)); Mockito.doReturn(true).when(mgr).handleDisconnectWithoutInvestigation(Mockito.any(attache.getClass()), Mockito.any(Status.Event.class), Mockito.anyBoolean(), Mockito.anyBoolean()); try { mgr.notifyMonitorsOfConnection(attache, cmds, false); @@ -83,7 +105,7 @@ public void testNotifyMonitorsOfConnectionWhenStoragePoolConnectionHostFailure() } catch (ConnectionException e) { Assert.assertEquals(e.getMessage(), connectionException.getMessage()); } - Mockito.verify(mgr, Mockito.times(1)).handleDisconnectWithoutInvestigation(Mockito.any(attache.getClass()), Mockito.eq(Status.Event.AgentDisconnected), Mockito.eq(true), Mockito.eq(true)); + Mockito.verify(mgr, Mockito.times(1)).handleDisconnectWithoutInvestigation(Mockito.any(attache.getClass()), eq(Status.Event.AgentDisconnected), eq(true), eq(true)); } @Test @@ -106,6 +128,128 @@ public void testGetTimeoutWithGranularTimeout() { Assert.assertEquals(50, result); } + @Test + public void testAliveHostStatusesConfigKey() { + ConfigKey configKey = mgr.AliveHostStatuses; + Assert.assertNotNull("Config key should not be null", configKey); + Assert.assertEquals("Config key should have correct key name", "alive.host.statuses", configKey.key()); + Assert.assertEquals("Config key should have correct default value", "Up,Creating,Connecting,Rebalancing", configKey.defaultValue()); + Assert.assertEquals("Config key should have correct category", "Advanced", configKey.category()); + } + + @Test + public void testGetConfigKeysIncludesAliveHostStatuses() { + AgentManagerImpl agentManager = new AgentManagerImpl(); + ConfigKey[] configKeys = agentManager.getConfigKeys(); + boolean found = false; + for (ConfigKey configKey : configKeys) { + if ("alive.host.statuses".equals(configKey.key())) { + found = true; + break; + } + } + Assert.assertTrue("AliveHostStatuses should be included in getConfigKeys()", found); + } + + @Test(expected = AgentUnavailableException.class) + public void testHandleDeregisterAttacheInAlertState() throws Exception { + FieldUtils.writeField(host, "status", Status.Alert, true); + attache = createClusterAttache(true); + when(hostDao.findById(eq(HOST_ID))).thenReturn(host); + + boolean result = mgr.handleDeregisterAttache(attache, EVENT); + Assert.assertTrue(result); + + mgr.getAttache(HOST_ID); + } + + @Test + public void testHandleDeregisterAttacheInAlertStateAnotherAttacheIsPresent() throws Exception { + FieldUtils.writeField(host, "status", Status.Alert, true); + attache = createClusterAttache(true); + when(hostDao.findById(eq(HOST_ID))).thenReturn(host); + + AgentAttache preExistingAttache = mgr.createAttacheForConnect(host, LINK); + + boolean result = mgr.handleDeregisterAttache(attache, EVENT); + Assert.assertTrue(result); + + // ensure pre-existing attache remains in place + AgentAttache testAttache = mgr.getAttache(HOST_ID); + Assert.assertEquals(preExistingAttache, testAttache); + } + + @Test + public void testHandleDeregisterAttacheInUpState() throws IllegalAccessException { + FieldUtils.writeField(host, "status", Status.Up, true); + attache = createClusterAttache(true); + when(hostDao.findById(eq(HOST_ID))).thenReturn(host); + + boolean result = mgr.handleDeregisterAttache(attache, EVENT); + Assert.assertTrue(result); + } + + @Test + public void testHandleDeregisterAttacheHostDoesNotExist() { + attache = createClusterAttache(true); + when(hostDao.findById(eq(HOST_ID))).thenReturn(null); + + boolean result = mgr.handleDeregisterAttache(attache, EVENT); + Assert.assertTrue(result); + } + + @Test + public void testHandleDeregisterAttacheNotForward() { + attache = createClusterAttache(false); + boolean result = mgr.handleDeregisterAttache(attache, EVENT); + Assert.assertFalse(result); + } + + @Test + public void testGetAliveHostStatusesDefaultConfiguration() throws Exception { + ConfigKey mockAliveHostStatuses = Mockito.mock(ConfigKey.class); + when(mockAliveHostStatuses.value()).thenReturn("Up"); + + FieldUtils.writeField(mgr, "AliveHostStatuses", mockAliveHostStatuses, true); + + Method initializeMethod = AgentManagerImpl.class.getDeclaredMethod("initializeAliveHostStatuses"); + initializeMethod.setAccessible(true); + initializeMethod.invoke(mgr); + + Method getAliveHostStatusesMethod = AgentManagerImpl.class.getDeclaredMethod("getAliveHostStatuses"); + getAliveHostStatusesMethod.setAccessible(true); + + @SuppressWarnings("unchecked") + Set result = (Set) getAliveHostStatusesMethod.invoke(mgr); + + Assert.assertEquals(1, result.size()); + Assert.assertTrue(result.contains(Status.Up)); + } + + @Test + public void testGetAliveHostStatusesExpandedConfiguration() throws Exception { + ConfigKey mockAliveHostStatuses = Mockito.mock(ConfigKey.class); + when(mockAliveHostStatuses.value()).thenReturn("Up,Alert,Connecting,Creating,Rebalancing"); + FieldUtils.writeField(mgr, "AliveHostStatuses", mockAliveHostStatuses, true); + + Method initializeMethod = AgentManagerImpl.class.getDeclaredMethod("initializeAliveHostStatuses"); + initializeMethod.setAccessible(true); + initializeMethod.invoke(mgr); + + Method getAliveHostStatusesMethod = AgentManagerImpl.class.getDeclaredMethod("getAliveHostStatuses"); + getAliveHostStatusesMethod.setAccessible(true); + + @SuppressWarnings("unchecked") + Set result = (Set) getAliveHostStatusesMethod.invoke(mgr); + + Assert.assertEquals(5, result.size()); + Assert.assertTrue(result.contains(Status.Up)); + Assert.assertTrue(result.contains(Status.Alert)); + Assert.assertTrue(result.contains(Status.Connecting)); + Assert.assertTrue(result.contains(Status.Creating)); + Assert.assertTrue(result.contains(Status.Rebalancing)); + } + @Test public void testGetHostSshPortWithHostNull() { int hostSshPort = mgr.getHostSshPort(null); @@ -115,7 +259,7 @@ public void testGetHostSshPortWithHostNull() { @Test public void testGetHostSshPortWithNonKVMHost() { HostVO host = Mockito.mock(HostVO.class); - Mockito.when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.XenServer); + when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.XenServer); int hostSshPort = mgr.getHostSshPort(host); Assert.assertEquals(22, hostSshPort); } @@ -123,8 +267,8 @@ public void testGetHostSshPortWithNonKVMHost() { @Test public void testGetHostSshPortWithKVMHostDefaultPort() { HostVO host = Mockito.mock(HostVO.class); - Mockito.when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.KVM); - Mockito.when(host.getClusterId()).thenReturn(1L); + when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.KVM); + when(host.getClusterId()).thenReturn(1L); int hostSshPort = mgr.getHostSshPort(host); Assert.assertEquals(22, hostSshPort); } @@ -132,8 +276,8 @@ public void testGetHostSshPortWithKVMHostDefaultPort() { @Test public void testGetHostSshPortWithKVMHostCustomPort() { HostVO host = Mockito.mock(HostVO.class); - Mockito.when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.KVM); - Mockito.when(host.getDetail(Host.HOST_SSH_PORT)).thenReturn(String.valueOf(3922)); + when(host.getHypervisorType()).thenReturn(Hypervisor.HypervisorType.KVM); + when(host.getDetail(Host.HOST_SSH_PORT)).thenReturn(String.valueOf(3922)); int hostSshPort = mgr.getHostSshPort(host); Assert.assertEquals(3922, hostSshPort); } diff --git a/engine/orchestration/src/test/java/com/cloud/agent/manager/ClusteredAgentManagerImplTest.java b/engine/orchestration/src/test/java/com/cloud/agent/manager/ClusteredAgentManagerImplTest.java index 5e4678f62225..4a7c9d02656f 100644 --- a/engine/orchestration/src/test/java/com/cloud/agent/manager/ClusteredAgentManagerImplTest.java +++ b/engine/orchestration/src/test/java/com/cloud/agent/manager/ClusteredAgentManagerImplTest.java @@ -18,16 +18,24 @@ package com.cloud.agent.manager; import com.cloud.configuration.ManagementServiceConfiguration; +import com.cloud.exception.AgentUnavailableException; import com.cloud.ha.HighAvailabilityManagerImpl; import com.cloud.host.HostVO; import com.cloud.host.Status; import com.cloud.host.dao.HostDao; import com.cloud.resource.ResourceManagerImpl; +import com.cloud.utils.db.GlobalLock; +import org.apache.cloudstack.framework.config.ConfigKey; +import org.apache.commons.lang3.reflect.FieldUtils; +import org.junit.After; +import org.junit.Assert; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.mockito.Mock; +import org.mockito.MockedStatic; import org.mockito.Mockito; +import org.mockito.Spy; import org.mockito.junit.MockitoJUnitRunner; import java.util.ArrayList; @@ -35,29 +43,86 @@ import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.anyBoolean; +import static org.mockito.ArgumentMatchers.anyInt; import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.doReturn; +import static org.mockito.Mockito.lenient; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; @RunWith(MockitoJUnitRunner.class) public class ClusteredAgentManagerImplTest { + private static final Long HOST_ID = 1L; + private static final Status.Event EVENT = Status.Event.AgentDisconnected; - private HostDao _hostDao; @Mock - ManagementServiceConfiguration _mgmtServiceConf; + ManagementServiceConfiguration mgmtServiceConf; + + @Spy + private ClusteredAgentManagerImpl mgr = new ClusteredAgentManagerImpl(); + @Mock + private HostDao hostDao; + @Mock + private ConfigKey PerformFullDisconnectOnAgentDisconnectEventBroadcast; + private GlobalLock hostJoinLockMock; + private HostVO host; + + MockedStatic globalLockMocked; @Before public void setUp() throws Exception { - _hostDao = mock(HostDao.class); + host = new HostVO("some-Uuid"); + FieldUtils.writeField(host, "id", HOST_ID, true); + + mgr._hostDao = hostDao; + mgr.PerformFullDisconnectOnAgentDisconnectEventBroadcast = PerformFullDisconnectOnAgentDisconnectEventBroadcast; + + mgr.createAttache(host); + globalLockMocked = Mockito.mockStatic(GlobalLock.class); + hostJoinLockMock = mock(GlobalLock.class); + } + + @After + public void tearDown() throws Exception { + globalLockMocked.close(); + } + + @Test + public void testFullDisconnectPerformed() throws AgentUnavailableException { + when(PerformFullDisconnectOnAgentDisconnectEventBroadcast.value()).thenReturn(Boolean.TRUE); + globalLockMocked.when(() -> GlobalLock.getInternLock(anyString())).thenReturn(hostJoinLockMock); + when(hostJoinLockMock.lock(anyInt())).thenReturn(Boolean.TRUE); + + boolean result = mgr.executeUserRequest(HOST_ID, EVENT); + Assert.assertTrue(result); + + GlobalLock.getInternLock(anyString()); + + verify(hostJoinLockMock, times(1)).lock(anyInt()); + } + + @Test + public void testDeregisterOnlyPerformed() throws AgentUnavailableException { + lenient().when(PerformFullDisconnectOnAgentDisconnectEventBroadcast.value()).thenReturn(Boolean.FALSE); + lenient().when(hostJoinLockMock.lock(anyInt())).thenReturn(Boolean.TRUE); + + boolean result = mgr.executeUserRequest(HOST_ID, EVENT); + Assert.assertTrue(result); + + GlobalLock.getInternLock(anyString()); + + // we do not expect any lock to be called as lock happens only in handleDisconnectWithoutInvestigation + verify(hostJoinLockMock, never()).lock(anyInt()); } @Test public void scanDirectAgentToLoadNoHostsTest() { ClusteredAgentManagerImpl clusteredAgentManagerImpl = mock(ClusteredAgentManagerImpl.class); - clusteredAgentManagerImpl._hostDao = _hostDao; + clusteredAgentManagerImpl._hostDao = hostDao; clusteredAgentManagerImpl.scanDirectAgentToLoad(); verify(clusteredAgentManagerImpl, never()).findAttache(anyLong()); verify(clusteredAgentManagerImpl, never()).loadDirectlyConnectedHost(any(), anyBoolean()); @@ -68,15 +133,14 @@ public void scanDirectAgentToLoadHostWithoutAttacheTest() { // Arrange ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(ClusteredAgentManagerImpl.class); HostVO hostVO = mock(HostVO.class); - clusteredAgentManagerImpl._hostDao = _hostDao; - clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf; + clusteredAgentManagerImpl._hostDao = hostDao; + clusteredAgentManagerImpl.mgmtServiceConf = mgmtServiceConf; clusteredAgentManagerImpl._resourceMgr = mock(ResourceManagerImpl.class); - when(_mgmtServiceConf.getTimeout()).thenReturn(16000L); + when(mgmtServiceConf.getTimeout()).thenReturn(16000L); when(hostVO.getId()).thenReturn(1L); - List hosts = new ArrayList<>(); + List hosts = new ArrayList<>(); hosts.add(hostVO); - when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); - AgentAttache agentAttache = mock(AgentAttache.class); + when(hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); doReturn(Boolean.TRUE).when(clusteredAgentManagerImpl).loadDirectlyConnectedHost(hostVO, false); clusteredAgentManagerImpl.scanDirectAgentToLoad(); verify(clusteredAgentManagerImpl).loadDirectlyConnectedHost(hostVO, false); @@ -86,13 +150,13 @@ public void scanDirectAgentToLoadHostWithoutAttacheTest() { public void scanDirectAgentToLoadHostWithForwardAttacheTest() { ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(ClusteredAgentManagerImpl.class); HostVO hostVO = mock(HostVO.class); - clusteredAgentManagerImpl._hostDao = _hostDao; - clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf; - when(_mgmtServiceConf.getTimeout()).thenReturn(16000L); + clusteredAgentManagerImpl._hostDao = hostDao; + clusteredAgentManagerImpl.mgmtServiceConf = mgmtServiceConf; + when(mgmtServiceConf.getTimeout()).thenReturn(16000L); when(hostVO.getId()).thenReturn(1L); - List hosts = new ArrayList<>(); + List hosts = new ArrayList<>(); hosts.add(hostVO); - when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); + when(hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); AgentAttache agentAttache = mock(AgentAttache.class); when(agentAttache.forForward()).thenReturn(Boolean.TRUE); when(clusteredAgentManagerImpl.findAttache(1L)).thenReturn(agentAttache); @@ -106,14 +170,14 @@ public void scanDirectAgentToLoadHostWithNonForwardAttacheTest() { // Arrange ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(new ClusteredAgentManagerImpl()); HostVO hostVO = mock(HostVO.class); - clusteredAgentManagerImpl._hostDao = _hostDao; - clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf; + clusteredAgentManagerImpl._hostDao = hostDao; + clusteredAgentManagerImpl.mgmtServiceConf = mgmtServiceConf; clusteredAgentManagerImpl._haMgr = mock(HighAvailabilityManagerImpl.class); - when(_mgmtServiceConf.getTimeout()).thenReturn(16000L); + when(mgmtServiceConf.getTimeout()).thenReturn(16000L); when(hostVO.getId()).thenReturn(0L); - List hosts = new ArrayList<>(); + List hosts = new ArrayList<>(); hosts.add(hostVO); - when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); + when(hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); AgentAttache agentAttache = mock(AgentAttache.class); when(agentAttache.forForward()).thenReturn(Boolean.FALSE); @@ -130,15 +194,15 @@ public void scanDirectAgentToLoadHostWithNonForwardAttacheTest() { public void scanDirectAgentToLoadHostWithNonForwardAttacheAndDisconnectedTest() { ClusteredAgentManagerImpl clusteredAgentManagerImpl = Mockito.spy(ClusteredAgentManagerImpl.class); HostVO hostVO = mock(HostVO.class); - clusteredAgentManagerImpl._hostDao = _hostDao; - clusteredAgentManagerImpl.mgmtServiceConf = _mgmtServiceConf; + clusteredAgentManagerImpl._hostDao = hostDao; + clusteredAgentManagerImpl.mgmtServiceConf = mgmtServiceConf; clusteredAgentManagerImpl._haMgr = mock(HighAvailabilityManagerImpl.class); clusteredAgentManagerImpl._resourceMgr = mock(ResourceManagerImpl.class); - when(_mgmtServiceConf.getTimeout()).thenReturn(16000L); + when(mgmtServiceConf.getTimeout()).thenReturn(16000L); when(hostVO.getId()).thenReturn(0L); - List hosts = new ArrayList<>(); + List hosts = new ArrayList<>(); hosts.add(hostVO); - when(_hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); + when(hostDao.findAndUpdateDirectAgentToLoad(anyLong(), anyLong(), anyLong())).thenReturn(hosts); AgentAttache agentAttache = mock(AgentAttache.class); when(agentAttache.forForward()).thenReturn(Boolean.FALSE); when(clusteredAgentManagerImpl.findAttache(0L)).thenReturn(agentAttache); diff --git a/engine/schema/src/main/resources/META-INF/db/procedures/cloud.idempotent_add_index.sql b/engine/schema/src/main/resources/META-INF/db/procedures/cloud.idempotent_add_index.sql new file mode 100644 index 000000000000..56fc7ae1b375 --- /dev/null +++ b/engine/schema/src/main/resources/META-INF/db/procedures/cloud.idempotent_add_index.sql @@ -0,0 +1,28 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- in cloud +DROP PROCEDURE IF EXISTS `cloud`.`IDEMPOTENT_ADD_INDEX`; + +CREATE PROCEDURE `cloud`.`IDEMPOTENT_ADD_INDEX` ( + IN in_index_name VARCHAR(200) +,IN in_table_name VARCHAR(200) +,IN in_index_definition VARCHAR(1000) +) +BEGIN + + DECLARE CONTINUE HANDLER FOR 1061 BEGIN END; SET @ddl = CONCAT('ALTER TABLE ', in_table_name, ' ADD INDEX ', in_index_name, ' ', in_index_definition, ', ALGORITHM=INPLACE, LOCK=NONE'); PREPARE stmt FROM @ddl; EXECUTE stmt; DEALLOCATE PREPARE stmt; END; diff --git a/engine/schema/src/main/resources/META-INF/db/schema-42210to42300.sql b/engine/schema/src/main/resources/META-INF/db/schema-42210to42300.sql index 4cb9eb7cb2c4..d68062780d70 100644 --- a/engine/schema/src/main/resources/META-INF/db/schema-42210to42300.sql +++ b/engine/schema/src/main/resources/META-INF/db/schema-42210to42300.sql @@ -117,3 +117,17 @@ CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.vpc_offerings','conserve_mode', 'tin --- Disable/enable NICs CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.nics','enabled', 'TINYINT(1) NOT NULL DEFAULT 1 COMMENT ''Indicates whether the NIC is enabled or not'' '); + +-- these indexes created from top 20 resource consuming DB queries +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_mshost_removed_state','cloud.mshost', '(removed, state)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_host_id_removed','cloud.host', '(id, removed)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_networks_id_removed','cloud.networks', '(id, removed)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_alert_type_data_center_id_archived_pod_id','cloud.alert', '(type, data_center_id, archived, pod_id)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_configuration_name','cloud.configuration', '(name)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_account_id_removed','cloud.account', '(id, removed)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_async_job_id_removed','cloud.async_job', '(id, removed)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_user_id_api_key_removed','cloud.user', '(id, api_key, removed)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_resource_limit_type_domain_id_tag','cloud.resource_limit', '(type, domain_id, tag)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_vm_instance_id_removed','cloud.vm_instance', '(id, removed)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_vm_host_state_removed','cloud.vm_instance', '(host_id, state, removed)'); +CALL `cloud`.`IDEMPOTENT_ADD_INDEX`('idx_vm_instance_name_removed','cloud.vm_instance', '(instance_name, removed)'); diff --git a/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletHttpHandler.java b/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletHttpHandler.java index 4e94f434f8f8..e59d4ace0324 100644 --- a/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletHttpHandler.java +++ b/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletHttpHandler.java @@ -19,12 +19,14 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.URLDecoder; +import java.util.Optional; import org.apache.http.HttpEntityEnclosingRequest; import org.apache.http.HttpException; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; +import org.apache.http.RequestLine; import org.apache.http.entity.BasicHttpEntity; import org.apache.http.protocol.HttpContext; import org.apache.http.protocol.HttpRequestHandler; @@ -73,10 +75,11 @@ public void handle(HttpRequest request, HttpResponse response, HttpContext conte @SuppressWarnings("deprecation") private void parseRequest(HttpRequest request) throws IOException { + String body = null; if (request instanceof HttpEntityEnclosingRequest) { final HttpEntityEnclosingRequest entityRequest = (HttpEntityEnclosingRequest)request; - final String body = EntityUtils.toString(entityRequest.getEntity()); + body = EntityUtils.toString(entityRequest.getEntity()); if (body != null) { final String[] paramArray = body.split("&"); if (paramArray != null) { @@ -97,6 +100,7 @@ private void parseRequest(HttpRequest request) throws IOException { } } } + logRequest(request, body); } private void writeResponse(HttpResponse response, int statusCode, String content) { @@ -113,6 +117,14 @@ private void writeResponse(HttpResponse response, int statusCode, String content response.setEntity(body); } + private void logRequest(HttpRequest request, String requestBody) { + Optional requestOpt = Optional.ofNullable(request); + Optional requestLineOpt = requestOpt.map(HttpRequest::getRequestLine); + String method = requestLineOpt.map(RequestLine::getMethod).orElse(null); + String uri = requestLineOpt.map(RequestLine::getUri).orElse(null); + logger.debug("{} {} {}", method, uri, requestBody); + } + protected void handleRequest(HttpRequest req, HttpResponse response) { final String method = (String)req.getParams().getParameter("method"); diff --git a/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletImpl.java b/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletImpl.java index d582538c31e0..3871c4eb6e92 100644 --- a/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletImpl.java +++ b/framework/cluster/src/main/java/com/cloud/cluster/ClusterServiceServletImpl.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; import java.rmi.RemoteException; import java.security.GeneralSecurityException; import java.util.ArrayList; @@ -98,9 +99,10 @@ public String execute(final ClusterServicePdu pdu) throws RemoteException { try { method.setEntity(new UrlEncodedFormEntity(postParameters, HttpUtils.UTF_8)); } catch (UnsupportedEncodingException e) { - logger.error("Failed to encode request POST parameters", e); + String msg = "Failed to encode request POST parameters: " + postParameters; + logger.error(msg, e); logPostParametersForFailedEncoding(postParameters); - throw new RemoteException("Failed to encode request POST parameters", e); + throw new RemoteException(msg, e); } return executePostMethod(client, method); @@ -126,9 +128,10 @@ public boolean ping(final String callingPeer) throws RemoteException { try { method.setEntity(new UrlEncodedFormEntity(postParameters, HttpUtils.UTF_8)); } catch (UnsupportedEncodingException e) { - logger.error("Failed to encode ping request POST parameters", e); + String msg = "Failed to encode ping request POST parameters: " + postParameters; + logger.error(msg, e); logPostParametersForFailedEncoding(postParameters); - throw new RemoteException("Failed to encode ping request POST parameters", e); + throw new RemoteException(msg, e); } final String returnVal = executePostMethod(client, method); @@ -137,6 +140,12 @@ public boolean ping(final String callingPeer) throws RemoteException { private String executePostMethod(final CloseableHttpClient client, final HttpPost method) { String result = null; + String request = null; + try { + request = EntityUtils.toString(method.getEntity(), Charset.defaultCharset()); + } catch (Exception e) { + logger.warn("Failed to retrieve request entity for POST {}", serviceUrl, e); + } try { final Profiler profiler = new Profiler(); profiler.start(); @@ -146,15 +155,15 @@ private String executePostMethod(final CloseableHttpClient client, final HttpPos result = EntityUtils.toString(httpResponse.getEntity()); profiler.stop(); if (logger.isDebugEnabled()) { - logger.debug("POST " + serviceUrl + " response :" + result + ", responding time: " + profiler.getDurationInMillis() + " ms"); + logger.debug("POST " + serviceUrl + " request: " + request + ", response :" + result + ", responding time: " + profiler.getDurationInMillis() + " ms"); } } else { profiler.stop(); - logger.error("Invalid response code : " + response + ", from : " + serviceUrl + ", method : " + method.getParams().getParameter("method") + " responding time: " + + logger.error("Invalid response code : " + response + ", from : " + serviceUrl + " request: " + request + ", method : " + method.getParams().getParameter("method") + " responding time: " + profiler.getDurationInMillis()); } } catch (IOException e) { - logger.error("Exception from : " + serviceUrl + ", method : " + method.getParams().getParameter("method") + ", exception :", e); + logger.error("Exception from : " + serviceUrl + " request: " + request + ", method : " + method.getParams().getParameter("method") + ", exception :", e); } finally { method.releaseConnection(); } diff --git a/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDao.java b/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDao.java index 6c8ffcac78b7..274e8833ff3a 100644 --- a/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDao.java +++ b/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDao.java @@ -29,6 +29,11 @@ public interface ManagementServerHostDao extends GenericDao findAllIncludingRemoved(); + ManagementServerHostVO findByMsid(long msid); int increaseAlertCount(long id); diff --git a/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDaoImpl.java b/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDaoImpl.java index ec943a9c26be..ceb9464c6219 100644 --- a/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDaoImpl.java +++ b/framework/cluster/src/main/java/com/cloud/cluster/dao/ManagementServerHostDaoImpl.java @@ -64,6 +64,11 @@ public void invalidateRunSession(long id, long runid) { } } + @Override + public List findAllIncludingRemoved() { + return listIncludingRemovedBy(ActiveSearch.create()); + } + @Override public ManagementServerHostVO findByMsid(long msid) { SearchCriteria sc = MsIdSearch.create(); diff --git a/framework/config/src/main/java/org/apache/cloudstack/framework/config/ConfigKeyUtil.java b/framework/config/src/main/java/org/apache/cloudstack/framework/config/ConfigKeyUtil.java new file mode 100644 index 000000000000..e8ae35cf5377 --- /dev/null +++ b/framework/config/src/main/java/org/apache/cloudstack/framework/config/ConfigKeyUtil.java @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.apache.cloudstack.framework.config; + +import org.apache.commons.lang3.StringUtils; + +import java.util.Arrays; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +/** + * Utility class that helps with configuration key manipulation. + * + * @author mprokopchuk + */ +public final class ConfigKeyUtil { + + /** + * Split by {@code ;} with optional space symbols (space, tab, new line, etc.) before and after. + */ + private static Pattern ENTRY_SEPARATOR_PATTERN = Pattern.compile("\\s*;\\s*"); + /** + * Split by {@code =} with optional space symbols (space, tab, new line, etc.) before and after. + */ + private static Pattern KEY_VALUE_SEPARATOR_PATTERN = Pattern.compile("\\s*=\\s*"); + + private ConfigKeyUtil() { + } + + /** + * Convert configuration value of format {@code key1=value1;key2=value2;...} to {@link Map}. + * + * @param configValue configuration value string + * @return configuration values map + */ + public static Map toMap(String configValue) { + if (StringUtils.isEmpty(configValue)) { + return Map.of(); + } + + return Arrays.stream(ENTRY_SEPARATOR_PATTERN.split(configValue)) + .map(pair -> KEY_VALUE_SEPARATOR_PATTERN.split(pair, 2)) + .filter(keyValue -> keyValue.length == 2) + .collect(Collectors.toMap(keyValue -> keyValue[0], keyValue -> keyValue[1])); + } +} diff --git a/framework/db/src/main/java/com/cloud/utils/db/DbUtil.java b/framework/db/src/main/java/com/cloud/utils/db/DbUtil.java index 88397f54d4f4..59dab18e3f1a 100644 --- a/framework/db/src/main/java/com/cloud/utils/db/DbUtil.java +++ b/framework/db/src/main/java/com/cloud/utils/db/DbUtil.java @@ -51,11 +51,16 @@ public class DbUtil { private static Map s_connectionForGlobalLocks = new HashMap(); + /** + * @param name lock name + * @param forLock true to add lock and false to remove + * @return {@link Connection} + */ public static Connection getConnectionForGlobalLocks(String name, boolean forLock) { synchronized (s_connectionForGlobalLocks) { if (forLock) { if (s_connectionForGlobalLocks.get(name) != null) { - LOGGER.error("Sanity check failed, global lock name " + name + " is already in use"); + LOGGER.error("Sanity check failed, global lock name {} is already in use", name); assert (false); } @@ -64,15 +69,18 @@ public static Connection getConnectionForGlobalLocks(String name, boolean forLoc try { connection.setAutoCommit(true); } catch (SQLException e) { - closeAutoCloseable(connection, "error closing connection for global locks"); + closeAutoCloseable(connection, String.format("error closing connection for global lock %s", name)); return null; } + LOGGER.debug("Storing connection for global lock {}", name); s_connectionForGlobalLocks.put(name, connection); return connection; } return null; } else { + // remove connection from references map, expecting it will be disposed later (and lock removed) Connection connection = s_connectionForGlobalLocks.get(name); + LOGGER.debug("Removing DB connection for global lock {}", name); s_connectionForGlobalLocks.remove(name); return connection; } @@ -80,9 +88,7 @@ public static Connection getConnectionForGlobalLocks(String name, boolean forLoc } public static void removeConnectionForGlobalLocks(String name) { - synchronized (s_connectionForGlobalLocks) { - s_connectionForGlobalLocks.remove(name); - } + getConnectionForGlobalLocks(name, false); } public static String getColumnName(Field field, AttributeOverride[] overrides) { @@ -199,7 +205,7 @@ public static final String getTableName(Class clazz) { public static boolean getGlobalLock(String name, int timeoutSeconds) { Connection conn = getConnectionForGlobalLocks(name, true); if (conn == null) { - LOGGER.error("Unable to acquire DB connection for global lock system"); + LOGGER.error("Unable to acquire DB connection for global lock {}", name); return false; } @@ -210,24 +216,54 @@ public static boolean getGlobalLock(String name, int timeoutSeconds) { try (ResultSet rs = pstmt.executeQuery();) { if (rs != null && rs.first()) { if (rs.getInt(1) > 0) { + LOGGER.debug("GET_LOCK() for global lock {} succeeded", name); return true; } else { - if (LOGGER.isDebugEnabled()) - LOGGER.debug("GET_LOCK() timed out on lock : " + name); + LOGGER.debug("GET_LOCK() for global lock {} is timed out", name); } } } - } catch (SQLException e) { - LOGGER.error("GET_LOCK() throws exception ", e); } catch (Throwable e) { - LOGGER.error("GET_LOCK() throws exception ", e); + LOGGER.error("GET_LOCK() for global lock {} throws exception", name, e); } removeConnectionForGlobalLocks(name); - closeAutoCloseable(conn, "connection for global lock"); + closeAutoCloseable(conn, String.format("connection for global lock %s", name)); return false; } + public static boolean isFreeLock(String name) { + boolean result = false; + Connection conn = TransactionLegacy.getStandaloneConnection(); + if (conn == null) { + LOGGER.error("Unable to acquire DB connection for IS_FREE_LOCK('{}'), returning default " + + "value {}", name, result); + return result; + } + try { + conn.setAutoCommit(true); + } catch (SQLException e) { + closeAutoCloseable(conn, String.format("error closing connection for IS_FREE_LOCK('%s'), returning " + + "default value %s", name, result)); + return result; + } + + try (PreparedStatement pstmt = conn.prepareStatement("SELECT COALESCE(IS_FREE_LOCK(?),0)");) { + pstmt.setString(1, name); + try (ResultSet rs = pstmt.executeQuery();) { + if (rs.next()) { + result = rs.getInt(1) > 0; + } + } + LOGGER.debug("IS_FREE_LOCK('{}') returned: {}", name, result); + } catch (Throwable e) { + LOGGER.error("IS_FREE_LOCK('{}') threw exception {}", name, e.getClass(), e); + } finally { + closeAutoCloseable(conn, String.format("connection for IS_FREE_LOCK('%s')", name)); + } + return result; + } + public static Class getEntityBeanType(GenericDao dao) { return dao.getEntityBeanType(); } @@ -235,7 +271,7 @@ public static Class getEntityBeanType(GenericDao dao) { public static boolean releaseGlobalLock(String name) { try (Connection conn = getConnectionForGlobalLocks(name, false);) { if (conn == null) { - LOGGER.error("Unable to acquire DB connection for global lock system"); + LOGGER.error("Unable to obtain DB connection to release global lock {}", name); assert (false); return false; } @@ -244,15 +280,14 @@ public static boolean releaseGlobalLock(String name) { pstmt.setString(1, name); try (ResultSet rs = pstmt.executeQuery();) { if (rs != null && rs.first()) { + LOGGER.debug("RELEASE_LOCK() for global lock {} succeeded", name); return rs.getInt(1) > 0; } - LOGGER.error("releaseGlobalLock:RELEASE_LOCK() returns unexpected result"); + LOGGER.error("RELEASE_LOCK() for global lock {} returns unexpected result", name); } } - } catch (SQLException e) { - LOGGER.error("RELEASE_LOCK() throws exception ", e); } catch (Throwable e) { - LOGGER.error("RELEASE_LOCK() throws exception ", e); + LOGGER.error("RELEASE_LOCK() for global lock {} throws exception", name, e); } return false; } diff --git a/framework/db/src/main/java/com/cloud/utils/db/GlobalLock.java b/framework/db/src/main/java/com/cloud/utils/db/GlobalLock.java index b01ddaf6853a..f992d7a4dc0f 100644 --- a/framework/db/src/main/java/com/cloud/utils/db/GlobalLock.java +++ b/framework/db/src/main/java/com/cloud/utils/db/GlobalLock.java @@ -16,11 +16,9 @@ // under the License. package com.cloud.utils.db; -import static java.lang.String.format; - import java.util.HashMap; import java.util.Map; -import java.util.concurrent.Callable; +import java.util.Optional; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.LogManager; @@ -45,21 +43,58 @@ *

*/ public class GlobalLock { - protected Logger logger = LogManager.getLogger(getClass()); + protected final static Logger logger = LogManager.getLogger(GlobalLock.class); private String name; - private int lockCount = 0; - private Thread ownerThread = null; - - private int referenceCount = 0; - private long holdingStartTick = 0; - - private static Map s_lockMap = new HashMap(); + /** + * DB lock count. + * Increments on {@link GlobalLock#lock(int)} and decrements on {@link GlobalLock#unlock()}. + * Upon {@link GlobalLock#unlock()}, if {@link GlobalLock#lockCount} is less than 1, then lock removed from DB + */ + private int lockCount; + + /** + * Internal (in-memory) lock count. + * Increments on {@link GlobalLock#addRef()} and indirectly on {@link GlobalLock#getInternLock(String)} and + * decrements on {@link GlobalLock#releaseRef()}, {@link GlobalLock#unlock()} and on {@link GlobalLock#lock(int)} + * if DB lock is unsuccessful + */ + private int referenceCount; + + /** + * Thread that owns lock. If lock called from different thread, it will be waiting for the owner to unlock it + * within requested timeout. If owner thread call {@link GlobalLock#lock(int)} again, then + * {@link GlobalLock#lockCount} will be incremented. + * If {@link GlobalLock#unlock()} called by owner thread, or DB lock will be unsuccessful, then owner thread will be + * nullified. + */ + private Thread ownerThread; + + /** + * Variable to hold lock duration in milliseconds. Used for information only. + */ + private long holdingStartTick; + + /** + * Holds all created locks. + */ + private static Map s_lockMap = new HashMap<>(); + + /** + * Create lock. + * + * @param name lock name + */ private GlobalLock(String name) { this.name = name; } + /** + * Increment reference count to lock. + * + * @return reference count + */ public int addRef() { synchronized (this) { referenceCount++; @@ -67,54 +102,118 @@ public int addRef() { } } + /** + * Decrement reference count to lock. + * + * @return reference count + */ public int releaseRef() { - int refCount; - boolean needToRemove = false; synchronized (this) { + if (logger.isDebugEnabled()) { + logger.debug("Releasing reference for internal lock {}, reference count: {}, lock count: {}", + name, referenceCount, lockCount); + } referenceCount--; - refCount = referenceCount; - - if (referenceCount < 0) - logger.warn("Unmatched Global lock " + name + " reference usage detected, check your code!"); - if (referenceCount == 0) + if (referenceCount < 0) { + logger.warn("Unmatched internal lock {} reference usage detected (reference count: {}, " + + "lock count: {}), check your code!", name, referenceCount, lockCount); + } else if (referenceCount < 1) { needToRemove = true; + } } - if (needToRemove) + if (needToRemove) { + if (logger.isDebugEnabled()) { + logger.debug("Need to release internal lock {}", name); + } releaseInternLock(name); + } + if (logger.isDebugEnabled()) { + logger.debug("Released reference for lock {}, reference count: {}", name, referenceCount); + } + return referenceCount; + } - return refCount; + public static boolean isLockAvailable(String name) { + if (logger.isDebugEnabled()) { + logger.debug("Checking lock present for {}", name); + } + boolean result = false; + try { + result = DbUtil.isFreeLock(name); + } finally { + if (logger.isDebugEnabled()) { + logger.debug("Result of checking lock present for {}: {}", name, result); + } + } + return result; } + /** + * Registers internal lock (in memory) object. Does not create any lock in DB yet. + * + * @param name lock name + * @return lock object + */ public static GlobalLock getInternLock(String name) { synchronized (s_lockMap) { + GlobalLock lock; if (s_lockMap.containsKey(name)) { - GlobalLock lock = s_lockMap.get(name); - lock.addRef(); - return lock; + lock = s_lockMap.get(name); + if (logger.isDebugEnabled()) { + logger.debug("Internal lock {} already exists with reference count {} and lock count {}", + name, lock.referenceCount, lock.lockCount); + } } else { - GlobalLock lock = new GlobalLock(name); - lock.addRef(); + lock = new GlobalLock(name); + if (logger.isDebugEnabled()) { + logger.debug("Internal lock {} does not exist, adding", name); + } s_lockMap.put(name, lock); - return lock; } + lock.addRef(); + if (logger.isDebugEnabled()) { + logger.debug("Added reference to internal lock {}, reference count {}, lock count {}", + name, lock.referenceCount, lock.lockCount); + } + return lock; } } + /** + * Unregister internal lock (in memory) object. Does not remove any lock from DB. + * + * @param name lock name + */ private void releaseInternLock(String name) { synchronized (s_lockMap) { GlobalLock lock = s_lockMap.get(name); if (lock != null) { - if (lock.referenceCount == 0) + if (lock.referenceCount == 0) { + if (logger.isDebugEnabled()) { + logger.debug("Released internal lock {}", name); + } s_lockMap.remove(name); + } else { + if (logger.isDebugEnabled()) { + logger.debug("Not releasing internal lock {} as it has references count: {}, lock count: {}", + name, lock.referenceCount, lock.lockCount); + } + } } else { - logger.warn("Releasing " + name + ", but it is already released."); + logger.warn("Internal lock {} already released", name); } } } + /** + * Acquire or join existing DB lock. + * + * @param timeoutSeconds time in seconds during which lock needs to be obtained (it is not the lock duration) + * @return true if lock successfully obtained + */ public boolean lock(int timeoutSeconds) { int remainingMilliSeconds = timeoutSeconds * 1000; Profiler profiler = new Profiler(); @@ -122,50 +221,70 @@ public boolean lock(int timeoutSeconds) { try { while (true) { synchronized (this) { - if (ownerThread != null && ownerThread == Thread.currentThread()) { - logger.warn("Global lock re-entrance detected"); - + if (ownerThread == Thread.currentThread()) { + logger.warn("Global lock {} re-entrance detected, owner thread: {}, reference count: {}, " + + "lock count: {}", getThreadName(ownerThread), name, referenceCount, lockCount); + // if it is re-entrance, then we may have more lock counts than needed? lockCount++; - if (logger.isTraceEnabled()) - logger.trace("lock " + name + " is acquired, lock count :" + lockCount); + if (logger.isDebugEnabled()) { + logger.debug("Global lock {} joined, reference count: {}, lock count: {}", + name, referenceCount, lockCount); + } return true; - } - - if (ownerThread != null) { + } else if (ownerThread != null) { profiler.start(); try { - wait((timeoutSeconds) * 1000L); + logger.debug("Waiting {} seconds to acquire global lock {}", timeoutSeconds, name); + wait(timeoutSeconds * 1000L); } catch (InterruptedException e) { interrupted = true; } profiler.stop(); remainingMilliSeconds -= profiler.getDurationInMillis(); - if (remainingMilliSeconds < 0) + if (remainingMilliSeconds < 0) { + logger.warn("Timeout of {} seconds to acquire global lock {} has been reached, " + + "owner thread {}, reference count: {}, lock count: {}", timeoutSeconds, name, getThreadName(ownerThread), referenceCount, lockCount); return false; + } continue; } else { // take ownership temporarily to prevent others enter into stage of acquiring DB lock ownerThread = Thread.currentThread(); + // XXX: do we need it here (???) addRef(); + if (logger.isDebugEnabled()) { + logger.debug("Taking ownership on global lock {} to acquire, owner thread: {}, " + + "reference count: {}, lock count: {}", name, getThreadName(ownerThread), referenceCount, lockCount); + } } } - if (DbUtil.getGlobalLock(name, remainingMilliSeconds / 1000)) { + int remainingSeconds = remainingMilliSeconds / 1000; + if (logger.isDebugEnabled()) { + logger.debug("Acquiring global lock {} in DB within remaining {} seconds", name, remainingSeconds); + } + if (DbUtil.getGlobalLock(name, remainingSeconds)) { synchronized (this) { lockCount++; holdingStartTick = System.currentTimeMillis(); - if (logger.isTraceEnabled()) - logger.trace("lock " + name + " is acquired, lock count :" + lockCount); + if (logger.isDebugEnabled()) { + logger.debug("Global lock {} acquired, reference count: {}, lock count: {}", + name, referenceCount, lockCount); + } return true; } } else { synchronized (this) { ownerThread = null; releaseRef(); + if (logger.isDebugEnabled()) { + logger.debug("Failed to acquire global lock in DB {}, reference count: {}, " + + "lock count: {}", name, referenceCount, lockCount); + } return false; } } @@ -177,16 +296,31 @@ public boolean lock(int timeoutSeconds) { } } + private String getThreadName(Thread thread) { + return Optional.ofNullable(thread).map(Thread::getName).orElse("N/A"); + } + + /** + * Decrements lock count, decrements lock reference if no more locks left, and remove lock if no more references + * left. Does the job only if lock owned by current thread. + * + * @return true if lock is owned by current thread + */ public boolean unlock() { synchronized (this) { - if (ownerThread != null && ownerThread == Thread.currentThread()) { + if (logger.isDebugEnabled()) { + logger.debug("Unlock {}", name); + } + if (ownerThread == Thread.currentThread()) { lockCount--; - if (lockCount == 0) { + if (lockCount < 0) { ownerThread = null; - DbUtil.releaseGlobalLock(name); + boolean result = DbUtil.releaseGlobalLock(name); - if (logger.isTraceEnabled()) - logger.trace("lock " + name + " is returned to free state, total holding time :" + (System.currentTimeMillis() - holdingStartTick)); + if (logger.isDebugEnabled()) { + logger.debug("Global lock {} is returned to {} state, total holding time {} seconds", + name, result ? "successful" : "unsuccessful", (System.currentTimeMillis() - holdingStartTick) / 1000); + } holdingStartTick = 0; // release holding position in intern map when we released the DB connection @@ -194,8 +328,9 @@ public boolean unlock() { notifyAll(); } - if (logger.isTraceEnabled()) - logger.trace("lock " + name + " is released, lock count :" + lockCount); + if (logger.isDebugEnabled()) { + logger.debug("lock {} released, lock count: {}", name, lockCount); + } return true; } return false; @@ -205,36 +340,4 @@ public boolean unlock() { public String getName() { return name; } - - public T executeWithLock(final String operationId, final int lockAcquisitionTimeout, final Callable operation) throws Exception { - - final GlobalLock lock = GlobalLock.getInternLock(operationId); - - try { - - if (!lock.lock(lockAcquisitionTimeout)) { - if (logger.isDebugEnabled()) { - logger.debug(format("Failed to acquire lock for operation id %1$s", operationId)); - } - return null; - } - - return operation.call(); - - } finally { - - if (lock != null) { - lock.unlock(); - } - - } - - } - - public T executeWithNoWaitLock(final String operationId, final Callable operation) throws Exception { - - return executeWithLock(operationId, 0, operation); - - } - } diff --git a/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java b/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java index dc7b6282b080..946ccdf267b0 100644 --- a/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java +++ b/server/src/main/java/org/apache/cloudstack/agent/lb/IndirectAgentLBServiceImpl.java @@ -446,6 +446,7 @@ protected boolean migrateNonRoutingHostAgentsInZone(String fromMsUuid, long from break; } + // FIXME: it is fire and forget, Management Server will never know if task failed migrateAgentsExecutorService.submit(new MigrateAgentConnectionTask(fromMsId, hostId, dc.getId(), orderedHostIdList, avoidMsList, lbCheckInterval, lbAlgorithm, lbAlgorithmChanged)); } @@ -465,6 +466,7 @@ protected boolean migrateNonRoutingHostAgentsInZone(String fromMsUuid, long from logger.debug(String.format("Force shutdown migrate non-routing agents service as it did not shutdown in the desired time due to: %s", e.getMessage())); } + // FIXME: This task can fail only if it is timed out, otherwise it is always succeeds no matter what is the migration result. return true; } @@ -595,7 +597,9 @@ protected void runInContext() { msList = getManagementServerList(hostId, dcId, orderedHostIdList, lbAlgorithm); } + // ask Host to reconnect to another Management Server final MigrateAgentConnectionCommand cmd = new MigrateAgentConnectionCommand(msList, avoidMsList, lbAlgorithm, lbCheckInterval); + // timeout 1 minute (FIXME: should it be configurable?) cmd.setWait(60); final Answer answer = agentManager.easySend(hostId, cmd); //may not receive answer when the agent disconnects immediately and try reconnecting to other ms host if (answer == null) { diff --git a/utils/src/main/java/com/cloud/utils/DateUtil.java b/utils/src/main/java/com/cloud/utils/DateUtil.java index 00ae5565dadc..daec494c41e4 100644 --- a/utils/src/main/java/com/cloud/utils/DateUtil.java +++ b/utils/src/main/java/com/cloud/utils/DateUtil.java @@ -22,12 +22,15 @@ import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; +import java.time.Duration; import java.time.LocalDateTime; import java.time.ZoneId; import java.time.ZonedDateTime; import java.time.YearMonth; +import java.util.ArrayList; import java.util.Calendar; import java.util.Date; +import java.util.List; import java.util.TimeZone; import java.time.format.DateTimeFormatter; @@ -346,4 +349,34 @@ public static ZonedDateTime getZoneDateTime(Date date, ZoneId tzId) { public static int getHoursInCurrentMonth(Date date) { return YearMonth.of(date.getYear(), date.getMonth() + 1).lengthOfMonth() * 24; } + + /** + * Formats duration in milliseconds to string {@code X min, Y sec, Z ms}. + * + * @param durationMs duration in milliseconds + * @return formatted string + */ + public static String formatMillis(long durationMs) { + Duration duration = Duration.ofMillis(durationMs); + long minutes = duration.toMinutes(); + duration = duration.minusMinutes(minutes); + long seconds = duration.toSeconds(); + duration = duration.minusSeconds(seconds); + long millis = duration.toMillis(); + + List parts = new ArrayList<>(); + if (minutes > 0) { + parts.add(minutes + " min"); + } + if (seconds > 0) { + parts.add(seconds + " sec"); + } + if (millis > 0) { + parts.add(millis + " ms"); + } + if (parts.isEmpty()) { + parts.add("0 ms"); + } + return String.join(", ", parts); + } } diff --git a/utils/src/main/java/com/cloud/utils/LogUtils.java b/utils/src/main/java/com/cloud/utils/LogUtils.java index 654773ee0a09..0ed9054141e2 100644 --- a/utils/src/main/java/com/cloud/utils/LogUtils.java +++ b/utils/src/main/java/com/cloud/utils/LogUtils.java @@ -20,6 +20,8 @@ package com.cloud.utils; import java.io.File; +import java.net.InetAddress; +import java.net.UnknownHostException; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -106,4 +108,29 @@ public static String logGsonWithoutException(String formatMessage, Object ... ob return errorMsg; } } + + /** + * Generates address entry for log in format of {@code IP_ADDRESS/HOST_NAME:PORT}, where {@code HOST_NAME} is + * optional if it cannot be resolved. + * + * @param address IP address or Host name + * @param port port + */ + public static String getHostLog(String address, Integer port) { + try { + InetAddress inetAddress = InetAddress.getByName(address); + String hostName = inetAddress.getHostName(); + String ipAddress = inetAddress.getHostAddress(); + if (port == null) { + return String.format("%s/%s", ipAddress, hostName); + } + return String.format("%s/%s:%s", ipAddress, hostName, port); + } catch (UnknownHostException e) { + LOGGER.warn("Failed to resolve name for address {}", address, e); + } + if (port == null) { + return address; + } + return String.format("%s:%s", address, port); + } } diff --git a/utils/src/main/java/com/cloud/utils/backoff/BackoffAlgorithm.java b/utils/src/main/java/com/cloud/utils/backoff/BackoffAlgorithm.java index b8f3f0032921..90ea823a45fb 100644 --- a/utils/src/main/java/com/cloud/utils/backoff/BackoffAlgorithm.java +++ b/utils/src/main/java/com/cloud/utils/backoff/BackoffAlgorithm.java @@ -21,6 +21,8 @@ import com.cloud.utils.component.Adapter; +import java.util.Map; + /** * BackoffAlgorithm implements multiple BackoffAlgorithm. */ @@ -35,4 +37,5 @@ public interface BackoffAlgorithm extends Adapter { */ void reset(); + Map getConfiguration(); } diff --git a/utils/src/main/java/com/cloud/utils/backoff/BackoffFactory.java b/utils/src/main/java/com/cloud/utils/backoff/BackoffFactory.java new file mode 100644 index 000000000000..37950df82017 --- /dev/null +++ b/utils/src/main/java/com/cloud/utils/backoff/BackoffFactory.java @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.utils.backoff; + +import com.cloud.utils.backoff.impl.ConstantTimeBackoff; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import javax.naming.ConfigurationException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; +import java.util.stream.Collectors; + +/** + * Backoff implementation factory. + * + * @author mprokopchuk + */ +public interface BackoffFactory { + Logger logger = LogManager.getLogger(BackoffFactory.class); + /** + * Property name for the implementation class (that extends {@link BackoffAlgorithm}) to be used either + * by {@code agent.properties} file or by configuration key. + */ + String BACKOFF_IMPLEMENTATION_KEY = "backoff.implementation"; + + /** + * Default backoff implementation class name ({@link ConstantTimeBackoff}). + */ + String DEFAULT_BACKOFF_IMPLEMENTATION = ConstantTimeBackoff.class.getName(); + + /** + * Creates default {@link BackoffAlgorithm} implementation object ({@link ConstantTimeBackoff}). + * + * @param properties configuration properties + * @return {@link BackoffAlgorithm} implementation object + */ + static BackoffAlgorithm createDefault(Properties properties) { + Properties newProperties = new Properties(properties); + newProperties.put(BACKOFF_IMPLEMENTATION_KEY, DEFAULT_BACKOFF_IMPLEMENTATION); + return create(newProperties); + } + + /** + * Creates {@link BackoffAlgorithm} implementation object, falls back to + * + * @param properties configuration properties + * @return {@link BackoffAlgorithm} implementation object + */ + static BackoffAlgorithm create(Properties properties) { + Map params = properties.entrySet().stream() + .collect(Collectors.toMap(e -> (String) e.getKey(), e -> (String) e.getValue())); + return create(params); + } + + /** + * Creates {@link BackoffAlgorithm} implementation object. + * + * @param params configuration parameters map + * @return {@link BackoffAlgorithm} implementation object + */ + static BackoffAlgorithm create(Map params) { + String className = params.getOrDefault(BACKOFF_IMPLEMENTATION_KEY, DEFAULT_BACKOFF_IMPLEMENTATION); + BackoffAlgorithm backoff; + try { + backoff = (BackoffAlgorithm) Class.forName(className).getDeclaredConstructor().newInstance(); + backoff.configure("Configuration", new HashMap<>(params)); + } catch (ReflectiveOperationException e) { + String msg = String.format("Failed to create backoff implementation for %s", className); + logger.warn(msg, e); + throw new RuntimeException(msg, e); + } catch (ConfigurationException e) { + String msg = String.format("Failed to configure backoff implementation for %s with parameters %s", + className, params); + logger.warn(msg, e); + throw new RuntimeException(msg, e); + } + + return backoff; + } +} diff --git a/utils/src/main/java/com/cloud/utils/backoff/impl/ConstantTimeBackoff.java b/utils/src/main/java/com/cloud/utils/backoff/impl/ConstantTimeBackoff.java index 043c77a9a0de..09163fd0832c 100644 --- a/utils/src/main/java/com/cloud/utils/backoff/impl/ConstantTimeBackoff.java +++ b/utils/src/main/java/com/cloud/utils/backoff/impl/ConstantTimeBackoff.java @@ -20,11 +20,13 @@ package com.cloud.utils.backoff.impl; import java.util.Collection; +import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import com.cloud.utils.NumbersUtil; import com.cloud.utils.backoff.BackoffAlgorithm; +import com.cloud.utils.backoff.BackoffFactory; import com.cloud.utils.component.AdapterBase; /** @@ -38,7 +40,17 @@ * } **/ public class ConstantTimeBackoff extends AdapterBase implements BackoffAlgorithm, ConstantTimeBackoffMBean { - long _time; + /** + * Property name for the delay between retries to be used either by {@code agent.properties} file or by configuration key. + */ + public static final String DELAY_SEC_CONFIG_KEY = "backoff.seconds"; + + /** + * Default value for the delay between retries for the property {@link ConstantTimeBackoff#DELAY_SEC_CONFIG_KEY}. + */ + public static final int DELAY_SEC_DEFAULT = 5; + + private long _time; private final Map _asleep = new ConcurrentHashMap(); @Override @@ -60,9 +72,17 @@ public void waitBeforeRetry() { public void reset() { } + @Override + public Map getConfiguration() { + Map configuration = new HashMap<>(); + configuration.put(BackoffFactory.BACKOFF_IMPLEMENTATION_KEY, getClass().getName()); + configuration.put(DELAY_SEC_CONFIG_KEY, String.valueOf(_time / 1000)); + return configuration; + } + @Override public boolean configure(String name, Map params) { - _time = NumbersUtil.parseLong((String)params.get("seconds"), 5) * 1000; + _time = NumbersUtil.parseLong((String) params.get(DELAY_SEC_CONFIG_KEY), DELAY_SEC_DEFAULT) * 1000; return true; } diff --git a/utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoff.java b/utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoff.java new file mode 100644 index 000000000000..4b077b87892e --- /dev/null +++ b/utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoff.java @@ -0,0 +1,166 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.utils.backoff.impl; + +import com.cloud.utils.DateUtil; +import com.cloud.utils.NumbersUtil; +import com.cloud.utils.backoff.BackoffAlgorithm; +import com.cloud.utils.backoff.BackoffFactory; +import com.cloud.utils.component.AdapterBase; + +import java.security.SecureRandom; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.ConcurrentHashMap; + +/** + * Exponential backoff with up/down cycling. + * Delay grows exponentially until a maximum, then decreases back to base, then repeats. + * + * @author mprokopchuk + */ +public class ExponentialWithJitterBackoff extends AdapterBase implements BackoffAlgorithm, + ExponentialWithJitterBackoffMBean { + + /** + * Property name for the minimal delay to be used either by {@code agent.properties} file or by configuration key. + */ + public static final String MIN_DELAY_MS_CONFIG_KEY = "backoff.min_delay_ms"; + + /** + * Property name for the maximal delay to be used either by {@code agent.properties} file or by configuration key. + */ + public static final String MAX_DELAY_MS_CONFIG_KEY = "backoff.max_delay_ms"; + + /** + * Default value for minimal delay for the property {@link ExponentialWithJitterBackoff#MIN_DELAY_MS_DEFAULT}. + */ + public static final int MIN_DELAY_MS_DEFAULT = 5_000; + + /** + * Default value for maximal delay for the property {@link ExponentialWithJitterBackoff#MAX_DELAY_MS_DEFAULT}. + */ + public static final int MAX_DELAY_MS_DEFAULT = 15_000; + + private final Map asleep = new ConcurrentHashMap<>(); + private final Random random = new SecureRandom(); + + private int minDelayMs; + private int maxDelayMs; + private int maxAttempts; + private int attemptNumber; + private boolean increasing; + + @Override + public void waitBeforeRetry() { + boolean interrupted = false; + long waitMs = getTimeToWait(); + Thread current = Thread.currentThread(); + try { + asleep.put(current.getName(), current); + logger.debug(String.format("Going to sleep for %s", DateUtil.formatMillis(waitMs))); + Thread.sleep(waitMs); + logger.debug(String.format("Sleep done for %s", DateUtil.formatMillis(waitMs))); + } catch (InterruptedException e) { + logger.info(String.format("Thread %s interrupted while waiting for retry", current.getName()), e); + } finally { + asleep.remove(current.getName()); + calculateNextAttempt(); + if (interrupted) { + Thread.currentThread().interrupt(); + } + } + } + + /** + * Calculates next attempt and direction. + */ + private void calculateNextAttempt() { + if (increasing) { + int nextAttemptNumber = attemptNumber + 1; + increasing = getNextDelay() <= maxDelayMs && nextAttemptNumber <= maxAttempts; + if (increasing) { + attemptNumber = nextAttemptNumber; + } + } else { + int nextAttemptNumber = Math.max(attemptNumber - 1, 0); + increasing = nextAttemptNumber == 0; + if (!increasing) { + attemptNumber = nextAttemptNumber; + } + } + } + + @Override + public void reset() { + attemptNumber = 0; + } + + @Override + public Map getConfiguration() { + Map configuration = new HashMap<>(); + configuration.put(MIN_DELAY_MS_CONFIG_KEY, String.valueOf(minDelayMs)); + configuration.put(MAX_DELAY_MS_CONFIG_KEY, String.valueOf(maxDelayMs)); + configuration.put(BackoffFactory.BACKOFF_IMPLEMENTATION_KEY, this.getClass().getName()); + return configuration; + } + + @Override + public boolean configure(String name, Map params) { + minDelayMs = NumbersUtil.parseInt((String) params.get(MIN_DELAY_MS_CONFIG_KEY), MIN_DELAY_MS_DEFAULT); + maxDelayMs = NumbersUtil.parseInt((String) params.get(MAX_DELAY_MS_CONFIG_KEY), MAX_DELAY_MS_DEFAULT); + maxAttempts = (int) Math.round(Math.log((double) maxDelayMs / minDelayMs) / Math.log(2)); + + attemptNumber = random.nextInt(maxAttempts + 1); + increasing = random.nextBoolean(); + // do nothing + return true; + } + + @Override + public Collection getWaiters() { + return asleep.keySet(); + } + + @Override + public boolean wakeup(String threadName) { + Thread th = asleep.get(threadName); + if (th != null) { + th.interrupt(); + return true; + } + return false; + } + + private long getNextDelay() { + return (long) Math.min(minDelayMs * Math.pow(2, attemptNumber), maxDelayMs); + } + + @Override + public long getTimeToWait() { + long delay = getNextDelay(); + int jitter = random.nextInt((int) delay / 2); + return delay + jitter; + } + + @Override + public void setTimeToWait(long seconds) { + // ignore + } +} diff --git a/utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoffMBean.java b/utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoffMBean.java new file mode 100644 index 000000000000..43d1a0a7ad3a --- /dev/null +++ b/utils/src/main/java/com/cloud/utils/backoff/impl/ExponentialWithJitterBackoffMBean.java @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package com.cloud.utils.backoff.impl; + +import com.cloud.utils.mgmt.ManagementBean; + +import java.util.Collection; + +public interface ExponentialWithJitterBackoffMBean extends ManagementBean { + long getTimeToWait(); + + void setTimeToWait(long seconds); + + Collection getWaiters(); + + boolean wakeup(String threadName); + +} diff --git a/utils/src/main/java/com/cloud/utils/exception/CSExceptionErrorCode.java b/utils/src/main/java/com/cloud/utils/exception/CSExceptionErrorCode.java index fd31db8a5a3f..ac1c0b5934a4 100644 --- a/utils/src/main/java/com/cloud/utils/exception/CSExceptionErrorCode.java +++ b/utils/src/main/java/com/cloud/utils/exception/CSExceptionErrorCode.java @@ -69,6 +69,7 @@ public class CSExceptionErrorCode { ExceptionErrorCodeMap.put("com.cloud.exception.StorageUnavailableException", 4385); ExceptionErrorCodeMap.put("com.cloud.exception.UnsupportedServiceException", 4390); ExceptionErrorCodeMap.put("com.cloud.exception.VirtualMachineMigrationException", 4395); + ExceptionErrorCodeMap.put("com.cloud.exception.ConnectionException", 4435); ExceptionErrorCodeMap.put("com.cloud.async.AsyncCommandQueued", 4540); ExceptionErrorCodeMap.put("com.cloud.exception.RequestLimitException", 4545); ExceptionErrorCodeMap.put("com.cloud.exception.StorageConflictException", 4550); diff --git a/utils/src/main/java/com/cloud/utils/net/NetUtils.java b/utils/src/main/java/com/cloud/utils/net/NetUtils.java index 65878e055e73..97cda1eed0c0 100644 --- a/utils/src/main/java/com/cloud/utils/net/NetUtils.java +++ b/utils/src/main/java/com/cloud/utils/net/NetUtils.java @@ -178,6 +178,9 @@ public static InetAddress getLocalInetAddress() { } public static String resolveToIp(final String host) { + if (host == null) { + return null; + } try { final InetAddress addr = InetAddress.getByName(host); return addr.getHostAddress(); diff --git a/utils/src/main/java/com/cloud/utils/nio/HandlerFactory.java b/utils/src/main/java/com/cloud/utils/nio/HandlerFactory.java index 9493f24b92b4..c858740646f3 100644 --- a/utils/src/main/java/com/cloud/utils/nio/HandlerFactory.java +++ b/utils/src/main/java/com/cloud/utils/nio/HandlerFactory.java @@ -19,7 +19,7 @@ package com.cloud.utils.nio; -import java.net.SocketAddress; +import java.net.InetSocketAddress; /** * WorkerFactory creates and selects workers. @@ -32,6 +32,6 @@ default int getMaxConcurrentNewConnectionsCount() { default int getNewConnectionsCount() { return 0; } - default void registerNewConnection(SocketAddress address) {} - default void unregisterNewConnection(SocketAddress address) {} + default void registerNewConnection(InetSocketAddress address) {} + default void unregisterNewConnection(InetSocketAddress address) {} } diff --git a/utils/src/main/java/com/cloud/utils/nio/Link.java b/utils/src/main/java/com/cloud/utils/nio/Link.java index 18bbb0533eeb..b53c3d99d377 100644 --- a/utils/src/main/java/com/cloud/utils/nio/Link.java +++ b/utils/src/main/java/com/cloud/utils/nio/Link.java @@ -31,6 +31,7 @@ import java.security.GeneralSecurityException; import java.security.KeyStore; import java.security.SecureRandom; +import java.util.Optional; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.Executor; import java.util.concurrent.Executors; @@ -63,6 +64,7 @@ public class Link { private final InetSocketAddress _addr; private final NioConnection _connection; private SelectionKey _key; + private Integer _localPort; private final ConcurrentLinkedQueue _writeQueue; private ByteBuffer _readBuffer; private ByteBuffer _plaintextBuffer; @@ -76,11 +78,8 @@ public Link(InetSocketAddress addr, NioConnection connection) { _addr = addr; _connection = connection; _readBuffer = ByteBuffer.allocate(2048); - _attach = null; - _key = null; _writeQueue = new ConcurrentLinkedQueue(); _readHeader = true; - _gotFollowingPacket = false; } public Link(Link link) { @@ -98,6 +97,26 @@ public void attach(Object attach) { public void setKey(SelectionKey key) { synchronized (this) { _key = key; + try { + _localPort = Optional.ofNullable(_key) + .map(SelectionKey::channel) + .filter(SocketChannel.class::isInstance) + .map(SocketChannel.class::cast) + .map(channel -> { + try { + return channel.getLocalAddress(); + } catch (IOException e) { + return null; + } + }) + .filter(InetSocketAddress.class::isInstance) + .map(InetSocketAddress.class::cast) + .map(InetSocketAddress::getPort) + .orElse(null); + + } catch (Exception e) { + // do nothing + } } } @@ -316,6 +335,9 @@ public void send(ByteBuffer[] data, boolean close) throws ClosedChannelException } synchronized (this) { if (_key == null) { + if (LOGGER.isTraceEnabled()) { + LOGGER.trace("SelectionKey is null for {}", _addr); + } throw new ClosedChannelException(); } _connection.change(SelectionKey.OP_WRITE, _key, null); @@ -354,16 +376,28 @@ public InetSocketAddress getSocketAddress() { return _addr; } + public Integer getLocalPort() { + return _localPort; + } + public String getIpAddress() { return _addr.getAddress().toString(); } public synchronized void terminated() { + if (LOGGER.isTraceEnabled()) { + LOGGER.debug("Terminating connection to {}", _addr); + } _key = null; } + public boolean isTerminated() { + return _key == null; + } + public synchronized void schedule(Task task) throws ClosedChannelException { if (_key == null) { + LOGGER.warn("Failed to schedule task as SelectionKey is null for {}", _addr); throw new ClosedChannelException(); } _connection.scheduleTask(task); @@ -504,8 +538,8 @@ private static HandshakeHolder doHandshakeUnwrap(final SocketChannel socketChann result = sslEngine.unwrap(peerNetData, peerAppData); peerNetData.compact(); } catch (final SSLException sslException) { - LOGGER.error(String.format("SSL error caught during unwrap data: %s, for local address=%s, remote address=%s. The client may have invalid ca-certificates.", - sslException.getMessage(), socketChannel.getLocalAddress(), socketChannel.getRemoteAddress())); + LOGGER.error("SSL error caught during unwrap data: {}, for local address={}, remote address={}. The client may have invalid ca-certificates.", + sslException.getMessage(), socketChannel.getLocalAddress(), socketChannel.getRemoteAddress(), sslException); sslEngine.closeOutbound(); return new HandshakeHolder(peerAppData, peerNetData, false); } @@ -683,4 +717,16 @@ boolean isSuccess() { } } + public String toString() { + StringBuilder str = new StringBuilder(); + if (LOGGER.isTraceEnabled()) { + str.append(System.identityHashCode(this)).append("-"); + } + Integer localPort = getLocalPort(); + if (localPort != null) { + str.append(localPort).append("-> "); + } + str.append(getSocketAddress()); + return str.toString(); + } } diff --git a/utils/src/main/java/com/cloud/utils/nio/NioClient.java b/utils/src/main/java/com/cloud/utils/nio/NioClient.java index d274973a6584..7887b52a8a70 100644 --- a/utils/src/main/java/com/cloud/utils/nio/NioClient.java +++ b/utils/src/main/java/com/cloud/utils/nio/NioClient.java @@ -21,16 +21,27 @@ import java.io.IOException; import java.net.InetSocketAddress; +import java.net.SocketAddress; +import java.nio.channels.CancelledKeyException; +import java.nio.channels.ClosedSelectorException; +import java.nio.channels.SelectableChannel; import java.nio.channels.SelectionKey; import java.nio.channels.Selector; import java.nio.channels.SocketChannel; import java.security.GeneralSecurityException; +import java.util.Arrays; +import java.util.Optional; +import java.util.Set; import javax.net.ssl.SSLContext; import javax.net.ssl.SSLEngine; +import com.cloud.utils.LogUtils; import org.apache.cloudstack.utils.security.SSLUtils; +/** + * NIO Client for the {@code com.cloud.agent.Agent}. + */ public class NioClient extends NioConnection { protected String host; @@ -43,57 +54,65 @@ public NioClient(final String name, final String host, final int port, final int this.host = host; } - protected void closeChannel() { - try { - if (clientConnection != null && clientConnection.isOpen()) { - clientConnection.close(); - } - } catch (IOException e) { - logger.error("Failed to close SocketChannel", e); - } - } - @Override protected void init() throws IOException { Task task; - String hostLog = host + ":" + _port; + String hostLog = LogUtils.getHostLog(host, _port); try { logger.info("Connecting to {}", hostLog); _selector = Selector.open(); clientConnection = SocketChannel.open(); + clientConnection.socket().setKeepAlive(true); + final InetSocketAddress serverAddress = new InetSocketAddress(host, _port); - clientConnection.connect(serverAddress); - logger.info("Connected to {}", hostLog); + boolean isConnected = clientConnection.connect(serverAddress); + logger.info("Connected to {}: {}", hostLog, isConnected); + clientConnection.configureBlocking(false); + logger.debug("Initializing client SSL context"); final SSLContext sslContext = Link.initClientSSLContext(); + logger.debug("Initialized client SSL context"); + + logger.debug("Creating SSL Engine for {}", hostLog); SSLEngine sslEngine = sslContext.createSSLEngine(host, _port); sslEngine.setUseClientMode(true); - sslEngine.setEnabledProtocols(SSLUtils.getSupportedProtocols(sslEngine.getEnabledProtocols())); + logger.debug("Created SSL Engine for {}", hostLog); + String[] enabledProtocols = SSLUtils.getSupportedProtocols(sslEngine.getEnabledProtocols()); + logger.debug("Enabled SSL Engine protocols for {}: {}", hostLog, Arrays.asList(enabledProtocols)); + sslEngine.setEnabledProtocols(enabledProtocols); + + Integer sshHandshakeTimeout = getSslHandshakeTimeout(); + logger.debug("Begin SSL Handshake for {} with timeout {}s (default 30s)", hostLog, sshHandshakeTimeout); sslEngine.beginHandshake(); - if (!Link.doHandshake(clientConnection, sslEngine, getSslHandshakeTimeout())) { + if (!Link.doHandshake(clientConnection, sslEngine, sshHandshakeTimeout)) { throw new IOException(String.format("SSL Handshake failed while connecting to host: %s", hostLog)); } - logger.info("SSL: Handshake done"); + logger.info("SSL Handshake done for {}", hostLog); final Link link = new Link(serverAddress, this); link.setSSLEngine(sslEngine); final SelectionKey key = clientConnection.register(_selector, SelectionKey.OP_READ); link.setKey(key); key.attach(link); - // Notice we've already connected due to the handshake, so let's get the - // remaining task done + + logger.info("Creating task {} for {}", Task.Type.CONNECT, hostLog); + // Notice we've already connected due to the handshake, so let's get the remaining task done task = _factory.create(Task.Type.CONNECT, link, null); - } catch (final GeneralSecurityException e) { - closeChannel(); - throw new IOException("Failed to initialise security", e); - } catch (final IOException e) { - closeChannel(); - logger.error("IOException while connecting to {}", hostLog, e); + logger.info("Created task {} for {}", Task.Type.CONNECT, hostLog); + } catch (GeneralSecurityException e) { + cleanUp(); + throw new IOException(String.format("Exception while connecting to %s", hostLog), e); + } catch (Exception e) { + cleanUp(); throw e; } if (task != null) { + logger.info("Submit task {} for {}", task.getType(), hostLog); _executor.submit(task); + logger.info("Submitted task {} for {}", task.getType(), hostLog); + } else { + logger.info("Task is null, nothing to submit for {}", hostLog); } } @@ -109,11 +128,85 @@ protected void unregisterLink(final InetSocketAddress address) { @Override public void cleanUp() throws IOException { - super.cleanUp(); - if (clientConnection != null && clientConnection.isOpen()) { - clientConnection.close(); - } - logger.info("NioClient connection closed"); + Optional selectorOptional = Optional.ofNullable(_selector); + selectorOptional + .filter(Selector::isOpen) + .map(selector -> { + Set keys; + try { + logger.trace("Getting keys from Selector"); + keys = selector.keys(); + logger.trace("Got {} keys from Selector", keys.size()); + } catch (ClosedSelectorException e) { + logger.trace("Failed to get keys from Selector", e); + keys = Set.of(); + } + return keys; + }) + .orElseGet(Set::of) + .forEach(key -> { + try { + if (key.isValid()) { + logger.trace("Cancelling SelectionKey"); + key.cancel(); + logger.trace("Cancelled SelectionKey"); + } else { + logger.trace("SelectionKey already cancelled"); + } + } catch (CancelledKeyException e) { + logger.trace("Failed to cancel SelectionKey", e); + } + Optional.ofNullable(key.channel()) + .filter(SelectableChannel::isOpen) + .ifPresent(channel -> { + try { + logger.trace("Closing SelectableChannel"); + channel.close(); + logger.trace("Closed SelectableChannel"); + } catch (IOException e) { + logger.trace("Failed to close SelectableChannel", e); + } + }); + }); + + selectorOptional.ifPresent(selector -> { + try { + logger.trace("Closing Selector"); + selector.close(); + logger.trace("Closed Selector"); + } catch (IOException e) { + logger.trace("Failed to close Selector", e); + } + }); + + // socket channel should be closed here already, but just in case + Optional.ofNullable(clientConnection) + .filter(SocketChannel::isOpen) + .ifPresent(socketChannel -> { + SocketAddress address; + try { + address = socketChannel.getRemoteAddress(); + } catch (IOException e) { + logger.trace("Failed to get SocketAddress from SocketChannel", e); + address = null; + } + try { + socketChannel.shutdownOutput(); + } catch (IOException e) { + logger.trace("Failed to shutdown output in SocketChannel", e); + } + try { + socketChannel.shutdownInput(); + } catch (IOException e) { + logger.trace("Failed to shutdown input in SocketChannel", e); + } + try { + socketChannel.close(); + } catch (IOException e) { + logger.trace("Failed to close SocketChannel", e); + } + logger.info("NioClient connection to {} closed", address); + }); } public String getHost() { diff --git a/utils/src/main/java/com/cloud/utils/nio/NioConnection.java b/utils/src/main/java/com/cloud/utils/nio/NioConnection.java index 8e1a208a164e..2eccbf8a8759 100644 --- a/utils/src/main/java/com/cloud/utils/nio/NioConnection.java +++ b/utils/src/main/java/com/cloud/utils/nio/NioConnection.java @@ -36,6 +36,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Optional; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; @@ -70,8 +71,8 @@ public abstract class NioConnection implements Callable { protected ExecutorService _threadExecutor; protected Future _futureTask; - protected boolean _isRunning; - protected boolean _isStartup; + protected volatile boolean _isRunning; + protected volatile boolean _isStartup; protected int _port; protected int _workers; protected List _todos; @@ -80,66 +81,72 @@ public abstract class NioConnection implements Callable { protected ExecutorService _executor; protected ExecutorService _sslHandshakeExecutor; protected CAService caService; - protected Set socketChannels = ConcurrentHashMap.newKeySet(); - protected Integer sslHandshakeTimeout = null; + protected Set socketChannels; + protected Integer sslHandshakeTimeout; private final int factoryMaxNewConnectionsCount; - protected boolean blockNewConnections; + protected volatile boolean blockNewConnections; public NioConnection(final String name, final int port, final int workers, final HandlerFactory factory) { + socketChannels = ConcurrentHashMap.newKeySet(); _name = name; - _isRunning = false; - blockNewConnections = false; - _selector = null; _port = port; _workers = workers; _factory = factory; this.factoryMaxNewConnectionsCount = factory.getMaxConcurrentNewConnectionsCount(); - initWorkersExecutor(); - initSSLHandshakeExecutor(); } - public void setCAService(final CAService caService) { + public void setCAService(CAService caService) { this.caService = caService; } public void start() throws NioConnectionException { _todos = new ArrayList<>(); + if (_executor == null || _executor.isShutdown() || _executor.isTerminated()) { + initWorkersExecutor(); + } + + if (_sslHandshakeExecutor == null || _sslHandshakeExecutor.isShutdown() || _sslHandshakeExecutor.isTerminated()) { + initSSLHandshakeExecutor(); + } + try { init(); } catch (final ConnectException e) { - logger.warn("Unable to connect to remote: is there a server running on port {}?", _port, e); - throw new NioConnectionException(e.getMessage(), e); - } catch (final IOException e) { - logger.error("Unable to initialize the threads.", e); - throw new NioConnectionException(e.getMessage(), e); + String msg = String.format("Unable to connect to remote: is there a server running on port %s: %s", _port, e.getMessage()); + throw new NioConnectionException(msg, e); } catch (final Exception e) { - logger.error("Unable to initialize the threads due to unknown exception.", e); - throw new NioConnectionException(e.getMessage(), e); + String msg = String.format("Unable to initialize the threads: %s", e.getMessage()); + throw new NioConnectionException(msg, e); } _isStartup = true; - if (_executor.isShutdown()) { - initWorkersExecutor(); - } - if (_sslHandshakeExecutor.isShutdown()) { - initSSLHandshakeExecutor(); - } _threadExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory(this._name + "-NioConnectionHandler")); _isRunning = true; + // in case start() called after stop() blockNewConnections = false; _futureTask = _threadExecutor.submit(this); } public void stop() { - _executor.shutdown(); - _sslHandshakeExecutor.shutdown(); - _isRunning = false; - blockNewConnections = true; + if (_executor != null) { + if (logger.isTraceEnabled()) { + logger.trace("Shutting down handler tasks"); + } + _executor.shutdownNow(); + } + if (_sslHandshakeExecutor != null) { + if (logger.isTraceEnabled()) { + logger.trace("Shutting down SSL Handshake executor"); + } + _sslHandshakeExecutor.shutdownNow(); + } if (_threadExecutor != null) { _futureTask.cancel(false); - _threadExecutor.shutdown(); + _threadExecutor.shutdownNow(); } + _isRunning = false; + blockNewConnections = true; } private void initWorkersExecutor() { @@ -174,44 +181,46 @@ public Boolean call() throws NioConnectionException { _selector.select(50); // Someone is ready for I/O, get the ready keys - final Set readyKeys = _selector.selectedKeys(); - final Iterator i = readyKeys.iterator(); + final Set keys = _selector.selectedKeys(); - logger.trace("Keys Processing: {}", readyKeys.size()); // Walk through the ready keys collection. - while (i.hasNext()) { - final SelectionKey sk = i.next(); - i.remove(); - - if (!sk.isValid()) { - logger.trace("Selection Key is invalid: {}", sk); - final Link link = (Link)sk.attachment(); + for (final Iterator it = keys.iterator(); it.hasNext();) { + final SelectionKey key = it.next(); + it.remove(); + + // isValid() means key is not cancelled and channel is not closed + if (!key.isValid()) { + logger.trace("Selection Key is invalid: {}", key); + final Link link = (Link)key.attachment(); if (link != null) { link.terminated(); } else { - closeConnection(sk); + closeConnection(key); } - } else if (sk.isReadable()) { - read(sk); - } else if (sk.isWritable()) { - write(sk); - } else if (sk.isAcceptable()) { - accept(sk); - } else if (sk.isConnectable()) { - connect(sk); + } else if (key.isReadable()) { + read(key); + } else if (key.isWritable()) { + write(key); + // this is for NIO Server only to accept new connection + } else if (key.isAcceptable()) { + accept(key); + // this is for NIO Client only to finish connect + } else if (key.isConnectable()) { + connect(key); } } - logger.trace("Keys Done Processing."); - processTodos(); + // and we cannot use existing selector anymore } catch (final ClosedSelectorException e) { /* - * Exception occurred when calling java.nio.channels.Selector.selectedKeys() method. It means the connection has not yet been established. Let's continue trying + * Exception occurred when calling java.nio.channels.Selector.selectedKeys() method. + * It means the connection has not yet been established. Let's continue trying. * We do not log it here otherwise we will fill the disk with messages. */ - } catch (final IOException e) { - logger.error("Agent will die due to this IOException!", e); + } catch (final Exception e) { + _isStartup = false; + logger.error("Agent will die due to this Exception", e); throw new NioConnectionException(e.getMessage(), e); } } @@ -229,7 +238,18 @@ protected boolean rejectConnectionIfBlocked(final SocketChannel socketChannel) t if (!blockNewConnections) { return false; } - logger.warn("Rejecting new connection as the server is blocked from accepting new connections"); + + try { + socketChannel.shutdownOutput(); + } catch (IOException ignore) { + // ignore exception + } + try { + socketChannel.shutdownInput(); + } catch (IOException ignore) { + // ignore exception + } + logger.warn("Rejecting new connection from {} as the server is blocked from accepting new connections", socketChannel.getRemoteAddress()); socketChannel.close(); _selector.wakeup(); return true; @@ -239,9 +259,19 @@ protected boolean rejectConnectionIfBusy(final SocketChannel socketChannel) thro if (factoryMaxNewConnectionsCount <= 0 || _factory.getNewConnectionsCount() < factoryMaxNewConnectionsCount) { return false; } + try { + socketChannel.shutdownOutput(); + } catch (IOException ignore) { + // ignore exception + } + try { + socketChannel.shutdownInput(); + } catch (IOException ignore) { + // ignore exception + } // Reject new connection if the server is busy - logger.warn("{} Rejecting new connection. {} active connections currently", - SERVER_BUSY_MESSAGE, factoryMaxNewConnectionsCount); + logger.warn("{} Rejecting new connection from {}. {} active connections currently", + SERVER_BUSY_MESSAGE, socketChannel.getRemoteAddress(), factoryMaxNewConnectionsCount); socketChannel.close(); _selector.wakeup(); return true; @@ -264,9 +294,12 @@ protected void accept(final SelectionKey key) throws IOException { final NioConnection nioConnection = this; _sslHandshakeExecutor.submit(() -> { final InetSocketAddress socketAddress = (InetSocketAddress)socket.getRemoteSocketAddress(); - _factory.registerNewConnection(socketAddress); - _selector.wakeup(); try { + if (logger.isTraceEnabled()) { + logger.trace("Registering new connection for {}", socketAddress.getAddress()); + } + _factory.registerNewConnection(socketAddress); + _selector.wakeup(); final SSLEngine sslEngine = Link.initServerSSLEngine(caService, socketChannel.getRemoteAddress().toString()); sslEngine.setUseClientMode(false); sslEngine.setEnabledProtocols(SSLUtils.getSupportedProtocols(sslEngine.getEnabledProtocols())); @@ -274,20 +307,26 @@ protected void accept(final SelectionKey key) throws IOException { if (!Link.doHandshake(socketChannel, sslEngine, getSslHandshakeTimeout())) { throw new IOException("SSL handshake timed out with " + socketAddress); } - logger.trace("SSL: Handshake done"); + logger.trace("SSL Handshake done for {}", socketAddress); final Link link = new Link(socketAddress, nioConnection); link.setSSLEngine(sslEngine); link.setKey(socketChannel.register(key.selector(), SelectionKey.OP_READ, link)); final Task task = _factory.create(Task.Type.CONNECT, link, null); registerLink(socketAddress, link); _executor.submit(task); - } catch (final GeneralSecurityException | IOException e) { + } catch (final GeneralSecurityException | IOException | RuntimeException e) { _factory.unregisterNewConnection(socketAddress); - logger.trace("Connection closed with {} due to failure: {}", socket.getRemoteSocketAddress(), e.getMessage()); + logger.trace("Connection closed with {} due to failure: {}", socket.getRemoteSocketAddress(), e.getMessage(), e); closeAutoCloseable(socket, "accepting socket"); closeAutoCloseable(socketChannel, "accepting socketChannel"); } finally { - _selector.wakeup(); + try { + _selector.wakeup(); + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to wakeup selector for {}", socketAddress, e); + } + } } }); } catch (final RejectedExecutionException e) { @@ -295,15 +334,21 @@ protected void accept(final SelectionKey key) throws IOException { closeAutoCloseable(socket, "Rejecting connection - accepting socket"); closeAutoCloseable(socketChannel, "Rejecting connection - accepting socketChannel"); } finally { - _selector.wakeup(); + try { + _selector.wakeup(); + } catch (final Exception e) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to wakeup selector for {}", socket, e); + } + } } } - protected void terminate(final SelectionKey key, String msg) { + protected void terminate(final SelectionKey key, final Exception cause) { final Link link = (Link)key.attachment(); closeConnection(key); if (link != null) { - logger.trace("Will terminate connection due to: {}", msg); + logger.trace("Will terminate connection to {} due to: {}", link.getSocketAddress(), cause.getLocalizedMessage(), cause); link.terminated(); final Task task = _factory.create(Task.Type.DISCONNECT, link, null); unregisterLink(link.getSocketAddress()); @@ -311,7 +356,7 @@ protected void terminate(final SelectionKey key, String msg) { try { _executor.submit(task); } catch (final Exception e) { - logger.warn("Exception occurred when submitting the task", e); + logger.warn("Exception occurred when submitting the task for {}", link.getSocketAddress(), e); } } } @@ -320,7 +365,7 @@ protected void read(final SelectionKey key) throws IOException { final Link link = (Link)key.attachment(); try { final SocketChannel socketChannel = (SocketChannel)key.channel(); - logger.trace("Reading from: {}", socketChannel.socket().toString()); + logger.trace("Reading from: {}", socketChannel.socket()); final byte[] data = link.read(socketChannel); if (data == null) { logger.trace("Packet is incomplete. Waiting for more."); @@ -331,11 +376,11 @@ protected void read(final SelectionKey key) throws IOException { try { _executor.submit(task); } catch (final Exception e) { - logger.warn("Exception occurred when submitting the task", e); + logger.warn("Exception occurred when submitting the task for {}", link.getSocketAddress(), e); } } catch (final Exception e) { logDebug(e, key, 1); - terminate(key, e.getMessage()); + terminate(key, e); } } @@ -349,7 +394,9 @@ protected void logTrace(final Exception e, final SelectionKey key, final int loc } } - logger.trace("Location " + loc + ": Socket " + socket + " closed on read. Probably -1 returned."); + if (logger.isTraceEnabled()) { + logger.trace("Location {}: Socket {} closed on read. Probably -1 returned.", loc, socket, e); + } } } @@ -363,7 +410,7 @@ protected void logDebug(final Exception e, final SelectionKey key, final int loc } } - logger.debug("Location " + loc + ": Socket " + socket + " closed on read. Probably -1 returned: " + e.getMessage()); + logger.debug("Location " + loc + ": Socket " + socket + " closed on read. Probably -1 returned: " + e.getMessage(), e); } } @@ -383,55 +430,55 @@ protected void processTodos() { SelectionKey key; for (final ChangeRequest todo : todos) { switch (todo.type) { - case ChangeRequest.CHANGEOPS: - try { - key = (SelectionKey)todo.key; - if (key != null && key.isValid()) { + case ChangeRequest.CHANGEOPS: + try { + key = (SelectionKey)todo.key; + if (key != null && key.isValid()) { + if (todo.att != null) { + key.attach(todo.att); + final Link link = (Link)todo.att; + link.setKey(key); + } + key.interestOps(todo.ops); + } + } catch (final CancelledKeyException e) { + logger.debug("key has been cancelled", e); + } + break; + case ChangeRequest.REGISTER: + try { + key = ((SocketChannel)todo.key).register(_selector, todo.ops, todo.att); if (todo.att != null) { - key.attach(todo.att); final Link link = (Link)todo.att; link.setKey(key); } - key.interestOps(todo.ops); - } - } catch (final CancelledKeyException e) { - logger.debug("key has been cancelled"); - } - break; - case ChangeRequest.REGISTER: - try { - key = ((SocketChannel)todo.key).register(_selector, todo.ops, todo.att); - if (todo.att != null) { - final Link link = (Link)todo.att; - link.setKey(key); - } - } catch (final ClosedChannelException e) { - logger.warn("Couldn't register socket: {}", todo.key); - try { - ((SocketChannel)todo.key).close(); - } catch (final IOException ignore) { - logger.info("[ignored] socket channel"); - } finally { - final Link link = (Link)todo.att; - link.terminated(); + } catch (final ClosedChannelException e) { + logger.warn("Couldn't register socket: {}", todo.key); + try { + ((SocketChannel)todo.key).close(); + } catch (final IOException ignore) { + logger.info("[ignored] socket channel"); + } finally { + final Link link = (Link)todo.att; + link.terminated(); + } } - } - break; - case ChangeRequest.CLOSE: - logger.trace("Trying to close {}", todo.key); - key = (SelectionKey)todo.key; - closeConnection(key); - if (key != null) { - final Link link = (Link)key.attachment(); - if (link != null) { - link.terminated(); + break; + case ChangeRequest.CLOSE: + logger.trace("Trying to close {}", todo.key); + key = (SelectionKey)todo.key; + closeConnection(key); + if (key != null) { + final Link link = (Link)key.attachment(); + if (link != null) { + link.terminated(); + } } + break; + default: + logger.warn("Shouldn't be here"); + throw new RuntimeException("Shouldn't be here"); } - break; - default: - logger.warn("Shouldn't be here"); - throw new RuntimeException("Shouldn't be here"); - } } logger.trace("Todos Done processing"); } @@ -459,7 +506,7 @@ protected void connect(final SelectionKey key) throws IOException { } } catch (final IOException e) { logTrace(e, key, 2); - terminate(key, e.getMessage()); + terminate(key, e); } } @@ -467,14 +514,18 @@ protected void scheduleTask(final Task task) { try { _executor.submit(task); } catch (final Exception e) { - logger.warn("Exception occurred when submitting the task", e); + InetSocketAddress socketAddress = Optional.ofNullable(task) + .map(Task::getLink) + .map(Link::getSocketAddress) + .orElse(null); + logger.warn("Exception occurred when submitting the task for {}", socketAddress, e); } } protected void write(final SelectionKey key) throws IOException { final Link link = (Link)key.attachment(); try { - logger.trace("Writing to {}", link.getSocketAddress().toString()); + logger.trace("Writing to {}", link.getSocketAddress()); final boolean close = link.write((SocketChannel)key.channel()); if (close) { closeConnection(key); @@ -484,7 +535,7 @@ protected void write(final SelectionKey key) throws IOException { } } catch (final Exception e) { logDebug(e, key, 3); - terminate(key, e.getMessage()); + terminate(key, e); } } @@ -506,16 +557,11 @@ protected void closeConnection(final SelectionKey key) { // 2. Cancel the key (safe to call even if already cancelled) key.cancel(); - if (channel == null) { - logger.trace("Channel was null, invalid, or not a SocketChannel for key: " + key); - return; - } - // 3. Try to close the channel if we obtained it if (channel != null) { closeChannel(channel); } else { - logger.trace("Channel was null, invalid, or not a SocketChannel for key: " + key); + logger.trace("Channel was null, invalid, or not a SocketChannel for key: {}", key); } } diff --git a/utils/src/main/java/com/cloud/utils/nio/NioServer.java b/utils/src/main/java/com/cloud/utils/nio/NioServer.java index c4f44afeccec..1fc426e4356a 100644 --- a/utils/src/main/java/com/cloud/utils/nio/NioServer.java +++ b/utils/src/main/java/com/cloud/utils/nio/NioServer.java @@ -23,12 +23,15 @@ import java.net.InetSocketAddress; import java.nio.channels.ClosedChannelException; import java.nio.channels.SelectionKey; +import java.nio.channels.Selector; import java.nio.channels.ServerSocketChannel; -import java.nio.channels.spi.SelectorProvider; import java.util.concurrent.ConcurrentHashMap; import org.apache.cloudstack.framework.ca.CAService; +/** + * NIO Server for the {@code com.cloud.agent.manager.AgentManagerImpl}. + */ public class NioServer extends NioConnection { protected InetSocketAddress localAddress; @@ -41,7 +44,6 @@ public NioServer(final String name, final int port, final int workers, final Han super(name, port, workers, factory); setCAService(caService); setSslHandshakeTimeout(sslHandShakeTimeout); - localAddress = null; links = new ConcurrentHashMap<>(1024); } @@ -51,7 +53,7 @@ public int getPort() { @Override protected void init() throws IOException { - _selector = SelectorProvider.provider().openSelector(); + _selector = Selector.open(); serverSocket = ServerSocketChannel.open(); serverSocket.configureBlocking(false); @@ -59,7 +61,7 @@ protected void init() throws IOException { localAddress = new InetSocketAddress(_port); serverSocket.socket().bind(localAddress); - serverSocket.register(_selector, SelectionKey.OP_ACCEPT, null); + serverSocket.register(_selector, SelectionKey.OP_ACCEPT); logger.info("NioServer started and listening on {}", serverSocket.socket().getLocalSocketAddress()); } diff --git a/utils/src/test/java/com/cloud/utils/backoff/impl/ConstantTimeBackoffTest.java b/utils/src/test/java/com/cloud/utils/backoff/impl/ConstantTimeBackoffTest.java index c822d6277ada..b471dba6c491 100644 --- a/utils/src/test/java/com/cloud/utils/backoff/impl/ConstantTimeBackoffTest.java +++ b/utils/src/test/java/com/cloud/utils/backoff/impl/ConstantTimeBackoffTest.java @@ -75,7 +75,7 @@ public void configureEmpty() { public void configureWithValue() { final ConstantTimeBackoff backoff = new ConstantTimeBackoff(); HashMap params = new HashMap(); - params.put("seconds", "100"); + params.put("backoff.seconds", "100"); backoff.configure("foo", params); Assert.assertEquals(100000, backoff.getTimeToWait()); } diff --git a/utils/src/test/java/com/cloud/utils/testcase/NioTest.java b/utils/src/test/java/com/cloud/utils/testcase/NioTest.java index 7d605e9f92b2..b35d26b0b09a 100644 --- a/utils/src/test/java/com/cloud/utils/testcase/NioTest.java +++ b/utils/src/test/java/com/cloud/utils/testcase/NioTest.java @@ -100,7 +100,7 @@ public void setUp() { testBytes = new byte[1000000]; randomGenerator.nextBytes(testBytes); - server = new NioServer("NioTestServer", 0, 1, new NioTestServer(), null, null); + server = new NioServer("NioTestServer", 0, 1, new NioTestServer(), null, null); try { server.start(); } catch (final NioConnectionException e) { @@ -245,8 +245,7 @@ public void doTask(final Task task) { logger.info("Sending data to server"); task.getLink().send(getTestBytes()); } catch (ClosedChannelException e) { - logger.error(e.getMessage()); - e.printStackTrace(); + logger.error(e.getMessage(), e); } } else if (task.getType() == Task.Type.DATA) { logger.info("Client: Received DATA task"); From d7a044d519c3393e6206f0990eec5f2d67f08580 Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Wed, 15 Apr 2026 11:18:17 +0530 Subject: [PATCH 2/2] fix test --- agent/src/test/java/com/cloud/agent/AgentTest.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/agent/src/test/java/com/cloud/agent/AgentTest.java b/agent/src/test/java/com/cloud/agent/AgentTest.java index 0932fdc82f98..9294f4de6b9f 100644 --- a/agent/src/test/java/com/cloud/agent/AgentTest.java +++ b/agent/src/test/java/com/cloud/agent/AgentTest.java @@ -33,7 +33,6 @@ import java.io.IOException; import java.lang.reflect.Field; -import java.lang.reflect.Modifier; import java.net.InetSocketAddress; import javax.naming.ConfigurationException; @@ -75,9 +74,6 @@ public void testGetLinkLogLinkWithTraceEnabledReturnsLinkLogWithHashCode() throw // fix LOGGER access to inject mock Field field = link.getClass().getDeclaredField("LOGGER"); field.setAccessible(true); - Field modifiersField = Field.class.getDeclaredField("modifiers"); - modifiersField.setAccessible(true); - modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL); field.set(null, logger); when(logger.isTraceEnabled()).thenReturn(true);