Skip to content

Commit bfbb8bb

Browse files
committed
Allow cleaning up of networks stuck in Implementing state
1 parent 4348386 commit bfbb8bb

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed

framework/jobs/src/main/java/org/apache/cloudstack/framework/jobs/impl/AsyncJobManagerImpl.java

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,13 @@ public void reallyRun() {
969969
}
970970

971971
logger.trace("End cleanup expired async-jobs");
972+
973+
// 3) Cleanup orphaned networks stuck in Implementing state without async jobs
974+
try {
975+
cleanupOrphanedNetworks();
976+
} catch (Throwable e) {
977+
logger.error("Unexpected exception when trying to cleanup orphaned networks", e);
978+
}
972979
} catch (Throwable e) {
973980
logger.error("Unexpected exception when trying to execute queue item, ", e);
974981
}
@@ -1284,6 +1291,74 @@ private void cleanupFailedSnapshotsCreatedWithDefaultStrategy(final long msid) {
12841291
}
12851292
}
12861293

1294+
/**
1295+
* Cleanup networks that are stuck in Implementing state without associated async jobs.
1296+
* This only processes networks that have been stuck for longer than the job expiration threshold.
1297+
*/
1298+
private void cleanupOrphanedNetworks() {
1299+
try {
1300+
SearchCriteria<NetworkVO> sc = networkDao.createSearchCriteria();
1301+
sc.addAnd("state", SearchCriteria.Op.EQ, Network.State.Implementing);
1302+
sc.addAnd("removed", SearchCriteria.Op.NULL);
1303+
List<NetworkVO> implementingNetworks = networkDao.search(sc, null);
1304+
1305+
if (implementingNetworks == null || implementingNetworks.isEmpty()) {
1306+
return;
1307+
}
1308+
1309+
logger.debug("Found {} networks in Implementing state, checking for orphaned networks", implementingNetworks.size());
1310+
1311+
final long expireMinutes = JobExpireMinutes.value();
1312+
final Date cutoffTime = new Date(System.currentTimeMillis() - (expireMinutes * 60 * 1000));
1313+
1314+
for (NetworkVO network : implementingNetworks) {
1315+
if (network.getCreated().after(cutoffTime)) {
1316+
logger.trace("Network {} in Implementing state is only {} minutes old (threshold: {} minutes), skipping cleanup",
1317+
network.getId(),
1318+
(System.currentTimeMillis() - network.getCreated().getTime()) / 60000,
1319+
expireMinutes);
1320+
continue;
1321+
}
1322+
1323+
List<AsyncJobVO> jobs = _jobDao.findInstancePendingAsyncJobs("Network", network.getAccountId());
1324+
boolean hasActiveJob = false;
1325+
for (AsyncJobVO job : jobs) {
1326+
if (job.getInstanceId() != null && job.getInstanceId().equals(network.getId())) {
1327+
hasActiveJob = true;
1328+
break;
1329+
}
1330+
}
1331+
1332+
if (hasActiveJob) {
1333+
logger.debug("Network {} in Implementing state has active async job, skipping cleanup", network.getId());
1334+
continue;
1335+
}
1336+
1337+
logger.warn("Found orphaned network {} in Implementing state without async job. " +
1338+
"Network created: {}, age: {} minutes, expiration threshold: {} minutes. Transitioning to Shutdown state.",
1339+
network.getId(), network.getCreated(),
1340+
(System.currentTimeMillis() - network.getCreated().getTime()) / 60000,
1341+
expireMinutes);
1342+
updateNetworkState(network);
1343+
1344+
}
1345+
} catch (Exception e) {
1346+
logger.error("Error while cleaning up orphaned networks", e);
1347+
}
1348+
}
1349+
1350+
private void updateNetworkState(NetworkVO network) {
1351+
try {
1352+
networkOrchestrationService.stateTransitTo(network, Network.Event.OperationFailed);
1353+
logger.info("Successfully transitioned orphaned network {} to Shutdown state using state machine", network.getId());
1354+
} catch (final NoTransitionException e) {
1355+
logger.debug("State transition failed for orphaned network {}, forcing state update", network.getId());
1356+
network.setState(Network.State.Shutdown);
1357+
networkDao.update(network.getId(), network);
1358+
logger.info("Successfully forced orphaned network {} to Shutdown state", network.getId());
1359+
}
1360+
}
1361+
12871362
@Override
12881363
public void onManagementNodeJoined(List<? extends ManagementServerHost> nodeList, long selfNodeId) {
// No-op: this implementation takes no action when management nodes join.
12891364
}

0 commit comments

Comments
 (0)